osx fixes. Fix issue 211.

2014-12-02 20:23:21 +00:00
parent 2d6fc0e304
commit 7e2a9d1adb
42 changed files with 176 additions and 940 deletions
--- a/src/bc6h/CMakeLists.txt
+++ b/src/bc6h/CMakeLists.txt
@ -0,0 +1,22 @@
+# Static library built from the BC6H (ZOH) compressor/decompressor sources in this directory.
+PROJECT(bc6h)
+
+# Make this directory an include path for the sources below.
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+
+# Headers are listed alongside the .cpp files.
+SET(BC6H_SRCS
+	bits.h
+	shapes_two.h
+	tile.h
+	zoh_utils.cpp
+	zoh_utils.h
+	zoh.cpp
+	zoh.h
+	zohone.cpp
+	zohtwo.cpp)
+
+ADD_LIBRARY(bc6h STATIC ${BC6H_SRCS})
+
+# Non-Windows GCC builds compile the static library with -fPIC
+# (presumably so it can be linked into a shared library -- confirm against the top-level build).
+IF(NOT WIN32)
+    IF(CMAKE_COMPILER_IS_GNUCXX)
+        SET_TARGET_PROPERTIES(bc6h PROPERTIES COMPILE_FLAGS -fPIC)
+    ENDIF(CMAKE_COMPILER_IS_GNUCXX)
+ENDIF(NOT WIN32)
--- a/src/bc6h/bits.h
+++ b/src/bc6h/bits.h
@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_BITS_H
+#define _ZOH_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/Debug.h"
+
+namespace ZOH {
+
+// Sequential bit reader/writer over a caller-owned byte buffer.
+// Stream bit n lives in bit (n & 7) of byte (n >> 3); multi-bit values are
+// transferred least-significant bit first.
+class Bits
+{
+public:
+
+	// Writable stream over `data`, which has room for `maxdatabits` bits. The stream starts empty.
+	Bits(char *data, int maxdatabits)
+		: bptr(0), bend(0), bits(data), cbits(data), maxbits(maxdatabits), readonly(0)
+	{
+		nvAssert (data && maxdatabits > 0);
+	}
+	// Read-only stream over `data`; all `availdatabits` bits are readable.
+	Bits(const char *data, int availdatabits)
+		: bptr(0), bend(availdatabits), bits(0), cbits(data), maxbits(availdatabits), readonly(1)
+	{
+		nvAssert (data && availdatabits > 0);
+	}
+
+	// Append the low `nbits` bits of `value` at the cursor, LSB first (0 <= nbits < 32).
+	void write(int value, int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	// Read `nbits` bits at the cursor, LSB first, and return them as an int (0 <= nbits < 32).
+	// Bits past the end of the stream read as 0.
+	int read(int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	// Current cursor position, in bits.
+	int getptr() const { return bptr; }
+	// Move the cursor to bit `ptr` (0 <= ptr < maxbits).
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	// Number of valid bits in the stream (last written bit + 1).
+	int getsize() const { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream (null for a read-only stream)
+	const char *cbits;	// ptr to const user bit stream (always valid; aliases `bits` when writable)
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	// Read one bit at the cursor and advance; returns 0 without advancing once the end is reached.
+	int readone() {
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		// cbits is set by both constructors, so no need to pick a pointer per access
+		int bit = cbits[bptr>>3] & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	// Store bit 0 of `bit` at the cursor and advance, growing the stream if the cursor was at its end.
+	void writeone(int bit) {
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		// when asserts are compiled out, refuse the write instead of going through a null/overflowing pointer
+		if (readonly || bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		++bptr;
+		if (bptr > bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
--- a/src/bc6h/shapes_two.h
+++ b/src/bc6h/shapes_two.h
@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_SHAPES_TWO_H
+#define _ZOH_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+// Region number (0 or 1) of every pixel for each of the NSHAPES two-region shapes.
+// Four 4x4 shapes sit side by side in each group of four text rows: shape si is found in
+// group (si >> 2), columns (si & 3)*4 .. (si & 3)*4 + 3, with pixel row y at offset y*16
+// inside the group. Use the REGION macro below rather than indexing this directly.
+static const int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+// Region (0 or 1) of pixel (x,y) in shape si.
+// ((si)>>2)*64 selects the group of four shapes, ((si)&3)*4 the shape's column block
+// within the group, (y)*16 the pixel row and (x) the pixel column.
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+// Two entries per shape, one for each region, in shape-index order; the region 0 entry is always 0.
+// NOTE(review): the values look like pixel positions (0..15) of the one index per region that is
+// stored in compressed form -- confirm against the users of SHAPEINDEX_TO_COMPRESSED_INDICES.
+static const int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+// Table entry for region `region` (0 or 1) of shape `si`.
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif
--- a/src/bc6h/tile.h
+++ b/src/bc6h/tile.h
@ -0,0 +1,83 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_TILE_H
+#define _ZOH_TILE_H
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.h"
+#include <math.h>
+
+namespace ZOH {
+
+//#define	USE_IMPORTANCE_MAP	1		// define this if you want to increase importance of some pixels in tile
+// A block of up to TILE_W x TILE_H pixels plus a per-pixel importance weight.
+// size_x/size_y give the part of `data` that is actually in use.
+class Tile
+{
+public:
+	// NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value
+	static float half2float(uint16 h)
+	{
+		return (float) Utils::ushort_to_format(h);
+	}
+	// NOTE: this is the inverse of the above operation
+	static uint16 float2half(float f)
+	{
+		return Utils::format_to_ushort((int)f);
+	}
+
+	// look for adjacent pixels that are identical. if there are enough of them, increase their importance
+	void generate_importance_map()
+	{
+		// initialize
+		for (int y=0; y<size_y; ++y)
+		for (int x=0; x<size_x; ++x)
+		{
+			// my importance is increased if I am identical to any of my 4-neighbors
+			importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f;
+		}
+	}
+	// True when pixel (xn,yn) lies inside the tile and all three channels equal those of pixel (x,y).
+	bool is_equal(int x, int y, int xn, int yn)
+	{
+		if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y)
+			return false;
+		return( (data[y][x].x == data[yn][xn].x) &&
+				(data[y][x].y == data[yn][xn].y) &&
+				(data[y][x].z == data[yn][xn].z) );
+	}
+
+#ifdef USE_IMPORTANCE_MAP
+	// True when (x,y) is identical to its left, right, upper or lower neighbor.
+	bool match_4_neighbor(int x, int y)
+	{
+		return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1);
+	}
+#else
+	// Importance map disabled: no pixel is ever boosted, so every weight ends up 1.0.
+	bool match_4_neighbor(int /*x*/, int /*y*/)
+	{
+		return false;
+	}
+#endif
+
+	// An empty (0 x 0) tile; previously size_x/size_y were left indeterminate here.
+	Tile() : size_x(0), size_y(0) {}
+	~Tile() {}
+	// A tile whose used area is xs x ys pixels; pixel data and importance map are left for the caller to fill.
+	Tile(int xs, int ys) : size_x(xs), size_y(ys) {}
+
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+    nv::Vector3 data[TILE_H][TILE_W];
+	float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+};
+
+}
+
+#endif // _ZOH_TILE_H
--- a/src/bc6h/zoh.cpp
+++ b/src/bc6h/zoh.cpp
@ -0,0 +1,197 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the zoh compressor and decompressor
+
+#include "tile.h"
+#include "zoh.h"
+
+#include <string.h> // memcpy
+
+using namespace ZOH;
+
+
+// True when the low five bits of the block's first byte hold one of the codes
+// 0x03, 0x07, 0x0b or 0x0f; decompress() uses this to pick the one-region decoder.
+bool ZOH::isone(const char *block)
+{
+	switch (block[0] & 0x1F)
+	{
+	case 0x03:
+	case 0x07:
+	case 0x0b:
+	case 0x0f:
+		return true;
+	default:
+		return false;
+	}
+}
+
+// Encode tile t with both the one-region and the two-region compressor and copy the
+// candidate with the smaller reported error into block (BLOCKSIZE bytes). Ties go to one-region.
+void ZOH::compress(const Tile &t, char *block)
+{
+	char candidate_one[ZOH::BLOCKSIZE];
+	char candidate_two[ZOH::BLOCKSIZE];
+
+	const float err_one = ZOH::compressone(t, candidate_one);
+	const float err_two = ZOH::compresstwo(t, candidate_two);
+
+	const char *winner = (err_one <= err_two) ? candidate_one : candidate_two;
+	memcpy(block, winner, ZOH::BLOCKSIZE);
+}
+
+// Decode block into tile t, dispatching on the mode test in isone().
+void ZOH::decompress(const char *block, Tile &t)
+{
+	if (!ZOH::isone(block))
+	{
+		ZOH::decompresstwo(block, t);
+		return;
+	}
+	ZOH::decompressone(block, t);
+}
+
+/*
+void ZOH::compress(string inf, string zohf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	Exr::readRgba(inf, pixels, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "wb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for write";
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	int ndots = 25;
+	int dotcnt = 0;
+	printf("Progress [");
+	for (int i=0; i<ndots;++i) printf(" ");
+	printf("]\rProgress ["); fflush(stdout);
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			ZOH::compress(t, block);
+			if (fwrite(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+			if (tilecnt > (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; }
+		}
+	}
+
+	printf("]\n");		// advance to next line finally
+
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+}
+
+static int str2int(std::string s)
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// zoh file name is ...-w-h.zoh, extract width and height
+static void extract(string zohf, int &w, int &h)
+{
+	size_t n = zohf.rfind('.', zohf.length()-1);
+	size_t n1 = zohf.rfind('-', n-1);
+	size_t n2 = zohf.rfind('-', n1-1);
+	string width = zohf.substr(n2+1, n1-n2-1);
+	w = str2int(width);
+	string height = zohf.substr(n1+1, n-n1-1);
+	h = str2int(height);
+}
+
+static int mode_to_prec[] = {
+	10,7,11,10,
+	10,7,11,11,
+	10,7,11,12,
+	10,7,9,16,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,6,-1,
+};
+
+static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions;
+
+static void stats(char block[ZOH::BLOCKSIZE])
+{
+	char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++;
+	int prec = mode_to_prec[mode];
+	nvAssert (prec != -1);
+	if (!ZOH::isone(block))
+	{
+		tworegions++;
+		prechisttwo[prec]++;
+		int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3);
+		shapeindexhist[shapeindex]++;
+	}
+	else
+	{
+		oneregion++;
+		prechistone[prec]++;
+	}
+}
+
+static void printstats()
+{
+	printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]);
+	printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]);
+	printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]);
+	printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]);
+	printf("\nOne region %5.2f%%  Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions));
+	printf("\n");
+}
+
+void ZOH::decompress(string zohf, string outf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	extract(zohf, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "rb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+
+			ZOH::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+	Exr::writeRgba(outf, pixels, w, h);
+
+#ifndef EXTERNAL_RELEASE
+	printstats();	// print statistics
+#endif
+}
+*/
--- a/src/bc6h/zoh.h
+++ b/src/bc6h/zoh.h
@ -0,0 +1,65 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_H
+#define _ZOH_H
+
+#include "tile.h"
+
+// Public interface of the ZOH block compressor/decompressor.
+namespace ZOH {
+
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f
+
+static const int NREGIONS_TWO	= 2;	// regions in a two-region block
+static const int NREGIONS_ONE	= 1;	// regions in a one-region block
+static const int NCHANNELS		= 3;	// channels per endpoint
+
+// A pair of endpoints (A, B) as float vectors.
+struct FltEndpts
+{
+    nv::Vector3 A;
+    nv::Vector3 B;
+};
+
+// A pair of endpoints (A, B) as signed integers, one per channel.
+struct IntEndpts
+{
+	int A[NCHANNELS];
+	int B[NCHANNELS];
+};
+
+// A pair of endpoints (A, B) as unsigned integers, one per channel.
+struct ComprEndpts
+{
+	uint A[NCHANNELS];
+	uint B[NCHANNELS];
+};
+
+static const int BLOCKSIZE=16;	// size of a compressed block, in bytes
+static const int BITSIZE=128;	// size of a compressed block, in bits
+
+// Compress tile t into block (BLOCKSIZE bytes) / decompress block into tile t.
+void compress(const Tile &t, char *block);
+void decompress(const char *block, Tile &t);
+
+// One-region and two-region variants. The compress functions return an error value;
+// compress() keeps whichever candidate reports the smaller one.
+float compressone(const Tile &t, char *block);
+float compresstwo(const Tile &t, char *block);
+void decompressone(const char *block, Tile &t);
+void decompresstwo(const char *block, Tile &t);
+
+// NOTE(review): rough*/refine* are defined outside this header; from the signatures, rough*
+// produces endpoints for a shape and refine* turns endpoints into a block -- confirm in zohone.cpp/zohtwo.cpp.
+float refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block);
+float roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]);
+
+float refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block);
+float roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]);
+
+// True when block's mode bits select a one-region mode.
+bool isone(const char *block);
+
+}
+
+#endif // _ZOH_H
--- a/src/bc6h/zoh_utils.cpp
+++ b/src/bc6h/zoh_utils.cpp
@ -0,0 +1,324 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace ZOH;
+
+// Fixed-point interpolation weights used by Utils::lerp: entry i is the weight of endpoint b
+// at step i, and entry (denom - i) the weight of endpoint a; each such pair sums to 64.
+static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+// Storage for the format selector declared in class Utils.
+/*static*/ Format Utils::FORMAT;
+
+// Integer interpolation between a and b at step i out of denom, using the 6-bit fixed-point
+// weight tables (result is rounded to nearest: +32, then >> 6).
+// denom must be 3, 7 or 15 and 0 <= i <= denom; denom 3 is mapped onto the 15-step table.
+int Utils::lerp(int a, int b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	const int round = 32, shift = 6;
+	const int *weights;
+
+	switch(denom)
+	{
+	case 3:		denom *= 5; i *= 5;	// fall through to case 15
+	case 15:	weights = denom15_weights_64; break;
+	case 7:		weights = denom7_weights_64; break;
+	default:
+		// same handling as the Vector3 overload; never read through an unset `weights`
+		nvUnreachable();
+		return a;
+	}
+
+	return (a*weights[denom-i] +b*weights[i] + round) >> shift;
+}
+
+// Vector interpolation between a and b at step i out of denom (3, 7 or 15), using the same
+// weight tables as the integer overload but without rounding.
+Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	const int *weights;
+
+	if (denom == 7)
+	{
+		weights = denom7_weights_64;
+	}
+	else if (denom == 15 || denom == 3)
+	{
+		// three steps are every fifth entry of the fifteen-step table
+		if (denom == 3) { denom = 15; i *= 5; }
+		weights = denom15_weights_64;
+	}
+	else
+	{
+		nvUnreachable();
+	}
+
+	const float weight_a = float(weights[denom-i]);
+	const float weight_b = float(weights[i]);
+
+	// no need to round these as this is an exact division
+	return (a*weight_a +b*weight_b) / float(1 << 6);
+}
+
+
+/*
+	For unsigned f16, clamp the input to [0,F16MAX]. Thus u15.
+	For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16.
+
+	The conversions proceed as follows:
+
+	unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX.
+	signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value
+	unsigned int: get bits. return as a positive value.
+	signed int. get bits. return as a value in -32768..32767.
+
+	The inverse conversions are just the inverse of the above.
+*/
+
+// clamp the 3 channels of the input vector to the allowable range based on FORMAT
+// note that each channel is a float storing the allowable range as a bit pattern converted to float
+// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX]
+
+// Clamp each of the three components of v, in place, to the range allowed by Utils::FORMAT:
+// [0, F16MAX] for UNSIGNED_F16 and [-F16MAX, F16MAX] for SIGNED_F16.
+void Utils::clamp(Vector3 &v)
+{
+	for (int ch=0; ch<3; ++ch)
+	{
+		if (Utils::FORMAT == UNSIGNED_F16)
+		{
+			if (v.component[ch] < 0.0) v.component[ch] = 0;
+			else if (v.component[ch] > F16MAX) v.component[ch] = F16MAX;
+		}
+		else if (Utils::FORMAT == SIGNED_F16)
+		{
+			if (v.component[ch] < -F16MAX) v.component[ch] = -F16MAX;
+			else if (v.component[ch] > F16MAX) v.component[ch] = F16MAX;
+		}
+		else
+		{
+			nvUnreachable();
+		}
+	}
+}
+
+// convert a u16 value to s17 (represented as an int) based on the format expected
+// Convert a 16-bit half bit pattern to the clamped integer used internally.
+// UNSIGNED_F16: patterns with the sign bit set become 0, anything above F16MAX becomes F16MAX.
+// SIGNED_F16: the magnitude (exp+mantissa bits) is clamped to F16MAX and negated if the sign bit was set.
+int Utils::ushort_to_format(unsigned short input)
+{
+	int out = 0;	// defined result even if FORMAT holds an unexpected value
+	int s;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		if (input & F16S_MASK) out = 0;
+		else if (input > F16MAX) out = F16MAX;
+		else out = input;
+		break;
+
+	case SIGNED_F16:
+		s = input & F16S_MASK;
+		input &= F16EM_MASK;
+		if (input > F16MAX) out = F16MAX;
+		else out = input;
+		out = s ? -out : out;
+		break;
+
+	default:
+		nvUnreachable();	// same handling as Utils::clamp
+	}
+	return out;
+}
+
+// convert a s17 value to u16 based on the format expected
+// Inverse of ushort_to_format: turn the internal integer back into a 16-bit pattern.
+// UNSIGNED_F16 expects 0..F16MAX and stores it unchanged; SIGNED_F16 expects -F16MAX..F16MAX
+// and stores it as sign bit | magnitude.
+unsigned short Utils::format_to_ushort(int input)
+{
+	unsigned short out = 0;	// defined result even if FORMAT holds an unexpected value
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (input >= 0 && input <= F16MAX);
+		out = input;
+		break;
+
+	case SIGNED_F16:
+		nvDebugCheck (input >= -F16MAX && input <= F16MAX);
+		// convert to sign-magnitude
+		int s;
+		if (input < 0) { s = F16S_MASK; input = -input; }
+		else           { s = 0; }
+		out = s | input;
+		break;
+
+	default:
+		nvUnreachable();	// same handling as Utils::clamp
+	}
+	return out;
+}
+
+// quantize the input range into equal-sized bins
+// Quantize value (rounded to the nearest integer first) to prec bits.
+// UNSIGNED_F16 maps 0..F16MAX onto 0..(1<<prec)-1; SIGNED_F16 quantizes the magnitude with
+// prec-1 bits and re-applies the sign. prec must be greater than 1.
+int Utils::quantize(float value, int prec)
+{
+	int q = 0;	// defined result even if FORMAT holds an unexpected value
+	int ivalue, s;
+
+	nvDebugCheck (prec > 1);	// didn't bother to make it work for 1
+
+	value = (float)floor(value + 0.5);
+
+	int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0;	// bias precisions 11..16 to get a more accurate quantization
+
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (value >= 0 && value <= F16MAX);
+		ivalue = (int)value;
+		q = ((ivalue << prec) + bias) / (F16MAX+1);
+		nvDebugCheck (q >= 0 && q < (1 << prec));
+		break;
+
+	case SIGNED_F16:
+		nvDebugCheck (value >= -F16MAX && value <= F16MAX);
+		// convert to sign-magnitude
+		ivalue = (int)value;
+		if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0;
+
+		q = ((ivalue << (prec-1)) + bias) / (F16MAX+1);
+		if (s)
+			q = -q;
+		nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
+		break;
+
+	default:
+		nvUnreachable();	// same handling as Utils::clamp
+	}
+
+	return q;
+}
+
+// Final scaling step after interpolation; prec is not used.
+int Utils::finish_unquantize(int q, int prec)
+{
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		// scale the magnitude by 31/64
+		return (q * 31) >> 6;
+
+	case SIGNED_F16:
+		// scale the magnitude by 31/32
+		if (q < 0)
+			return -(((-q) * 31) >> 5);
+		return (q * 31) >> 5;
+
+	default:
+		return q;
+	}
+}
+
+// unquantize each bin to midpoint of original bin range, except
+// for the end bins which we push to an endpoint of the bin range.
+// we do this to ensure we can represent all possible original values.
+// the asymmetric end bins do not affect PSNR for the test images.
+//
+// code this function assuming an arbitrary bit pattern as the encoded block
+// Expand a prec-bit quantized value q back to the 16-bit working range
+// (0..U16MAX for UNSIGNED_F16, -S16MAX..S16MAX for SIGNED_F16). prec must be greater than 1.
+int Utils::unquantize(int q, int prec)
+{
+	int unq = 0;	// defined result even if FORMAT holds an unexpected value
+	int s;
+
+	nvDebugCheck (prec > 1);	// not implemented for prec 1
+
+	switch (Utils::FORMAT)
+	{
+	// modify this case to move the multiplication by 31 after interpolation.
+	// Need to use finish_unquantize.
+
+	// since we have 16 bits available, let's unquantize this to 16 bits unsigned
+	// thus the scale factor is [0-7c00)/[0-10000) = 31/64
+	case UNSIGNED_F16:
+		if (prec >= 15) 
+			unq = q;
+		else if (q == 0) 
+			unq = 0;
+		else if (q == ((1<<prec)-1)) 
+			unq = U16MAX;
+		else
+			unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;
+		break;
+
+	// here, let's stick with S16 (no apparent quality benefit from going to S17)
+	// range is (-7c00..7c00)/(-8000..8000) = 31/32
+	case SIGNED_F16:
+		// don't remove this test even though it appears equivalent to the code below
+		// as it isn't -- the code below can overflow for prec = 16
+		if (prec >= 16)
+			unq = q;
+		else
+		{
+			if (q < 0) { s = 1; q = -q; } else s = 0;
+
+			if (q == 0)
+				unq = 0;
+			else if (q >= ((1<<(prec-1))-1))
+				unq = s ? -S16MAX : S16MAX;
+			else
+			{
+				unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);
+				if (s)
+					unq = -unq;
+			}
+		}
+		break;
+
+	default:
+		nvUnreachable();	// same handling as Utils::clamp
+	}
+	return unq;
+}
+
+
+
+// pick a norm!
+#define	NORM_EUCLIDEAN 1
+
+// Error metric between two vectors, selected at compile time:
+// NORM_EUCLIDEAN returns lengthSquared(a - b); NORM_ABS returns the sum of the absolute
+// per-channel differences. NORM_EUCLIDEAN wins if both are defined; defining neither is a
+// build error (it used to compile into a function with no return statement).
+float Utils::norm(const Vector3 &a, const Vector3 &b)
+{
+#if defined(NORM_EUCLIDEAN)
+	return lengthSquared(a - b);
+#elif defined(NORM_ABS)
+	Vector3 err = a - b;
+	return fabs(err.x) + fabs(err.y) + fabs(err.z);
+#else
+#error "Utils::norm: define NORM_EUCLIDEAN or NORM_ABS"
+#endif
+}
+
+// parse <name>[<start>{:<end>}]{,}	
+// the pointer starts here         ^
+// name is 1 or 2 chars and matches field names. start and end are decimal numbers
+//
+// The encoding string is scanned BACKWARDS: ptr enters just past the field to parse and
+// leaves indexing the first character of that field's name, ready for the next call.
+// Outputs: field = which field was named, endbit = the number after ':' (or the only
+// number when there is no ':'), len = start - end + 1.
+// When ptr <= 0 on entry nothing is parsed and the outputs are left untouched.
+void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len)
+{
+	if (ptr <= 0) return;
+	--ptr;
+	if (encoding[ptr] == ',') --ptr;	// skip the optional separator
+	nvDebugCheck (encoding[ptr] == ']');
+	--ptr;
+	// digits are met least-significant first, so accumulate with a growing scale
+	endbit = 0;
+	int scale = 1;
+	while (encoding[ptr] != ':' && encoding[ptr] != '[')
+	{
+		nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+		endbit += (encoding[ptr--] - '0') * scale;
+		scale *= 10;
+	}
+	int startbit = 0; scale = 1;
+	if (encoding[ptr] == '[')
+		startbit = endbit;	// single-bit form <name>[<n>]
+	else  
+	{
+		ptr--;	// step over ':'
+		while (encoding[ptr] != '[')
+		{
+			nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+			startbit += (encoding[ptr--] - '0') * scale;
+			scale *= 10;
+		}
+	}
+	len = startbit - endbit + 1;	// startbit>=endbit note
+	--ptr;	// step over '[' onto the last character of the name
+	if (encoding[ptr] == 'm')		field = FIELD_M;
+	else if (encoding[ptr] == 'd')	field = FIELD_D;
+	else {
+		// it's wxyz
+		nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z');
+		int foo = encoding[ptr--] - 'w';
+		// now it is r g or b
+		// the Field enumerators are 10/20/30 + (0..3), so the value can be built arithmetically
+		if (encoding[ptr] == 'r')		foo += 10;
+		else if (encoding[ptr] == 'g')	foo += 20;
+		else if (encoding[ptr] == 'b')	foo += 30;
+		else nvDebugCheck(0);
+		field = (Field) foo;
+	}
+}
+
+
--- a/src/bc6h/zoh_utils.h
+++ b/src/bc6h/zoh_utils.h
@ -0,0 +1,73 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#pragma once
+#ifndef _ZOH_UTILS_H
+#define _ZOH_UTILS_H
+
+#include "nvmath/Vector.h"
+
+namespace ZOH {
+
+// Sign-extend the low nb bits of x: if bit (nb-1) is set, every bit from nb upwards is set too,
+// otherwise x is returned unchanged. Done in unsigned arithmetic because left-shifting the
+// negative value ~0 is undefined behaviour. An nb outside 1..(bits in int - 1) returns x as is.
+inline int SIGN_EXTEND(int x, int nb)
+{
+    if (nb <= 0 || nb >= int(sizeof(int) * 8))
+        return x;
+
+    const unsigned int ux = (unsigned int)x;
+    const unsigned int sign_bit = 1u << (nb - 1);
+    if (ux & sign_bit)
+        return (int)(ux | (~0u << nb));
+    return x;
+}
+
+// Fields named in a mode's encoding string. The channel fields are numbered
+// 10 (red) / 20 (green) / 30 (blue) plus 0..3 for w/x/y/z, which is how Utils::parse computes them.
+enum Field {
+    FIELD_M = 1,	// mode
+    FIELD_D = 2,	// distribution/shape
+    FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3,	// red channel endpoints or deltas
+    FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3,	// green channel endpoints or deltas
+    FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3,	// blue channel endpoints or deltas
+};
+
+// some constants
+// Bit masks and range limits used by the Utils conversion and quantization routines.
+static const int F16S_MASK	=  0x8000;		// f16 sign mask
+static const int F16EM_MASK	=  0x7fff;		// f16 exp & mantissa mask
+static const int U16MAX		=  0xffff;		// largest unsigned 16-bit value
+static const int S16MIN		= -0x8000;		// smallest signed 16-bit value
+static const int S16MAX		=  0x7fff;		// largest signed 16-bit value
+static const int INT16_MASK	=  0xffff;		// low 16 bits
+static const int F16MAX		=  0x7bff;		// MAXFLT bit pattern for halfs
+
+// Which kind of half values is being handled; selected through Utils::FORMAT.
+enum Format { UNSIGNED_F16, SIGNED_F16 };
+
+// Static helpers shared by the one-region and two-region coders. Everything here depends on
+// the global FORMAT setting.
+class Utils
+{
+public:
+    static Format FORMAT;     // this is a global -- we're either handling signed or unsigned half values
+
+    // error metrics
+    static float norm(const nv::Vector3 &a, const nv::Vector3 &b);
+    // NOTE(review): no definition of mpsnr_norm appears in zoh_utils.cpp -- confirm it is defined or unused
+    static float mpsnr_norm(const nv::Vector3 &a, int exposure, const nv::Vector3 &b);
+
+    // conversion & clamp
+    static int ushort_to_format(unsigned short input);
+    static unsigned short format_to_ushort(int input);
+
+    // clamp to format
+    static void clamp(nv::Vector3 &v);
+
+    // quantization and unquantization
+    static int finish_unquantize(int q, int prec);
+    static int unquantize(int q, int prec);
+    static int quantize(float value, int prec);
+
+    // parse one field of an encoding string, scanning backwards from ptr
+    static void parse(const char *encoding, int &ptr, Field & field, int &endbit, int &len);
+
+    // lerping
+    static int lerp(int a, int b, int i, int denom);
+    static nv::Vector3 lerp(const nv::Vector3 & a, const nv::Vector3 & b, int i, int denom);
+};
+
+}
+
+#endif // _ZOH_UTILS_H
--- a/src/bc6h/zohone.cpp
+++ b/src/bc6h/zohone.cpp
@ -0,0 +1,799 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// one region zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/Fitting.h"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+// palette size: each pixel stores an INDEXBITS-wide index into a NINDICES-entry palette
+#define NINDICES	16
+#define	INDEXBITS	4
+// top bit of an index; swap_indices() keeps it clear at position 0 so that bit need not be stored
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+// denominator handed to Utils::lerp when interpolating palette entries
+#define	DENOM		(NINDICES-1)
+
+// number of partition shapes for the one-region modes
+#define	NSHAPES	1
+
+// per-shape bitmask of region membership, tested by REGION(); the single shape is all zeros, so every pixel is in region 0
+static const int shapes[NSHAPES] =
+{
+    0x0000
+};	// only 1 shape
+
+// region of pixel (x,y) for a shape: bit (15 - x - 4*y) of the shape's mask, as 0 or 1
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+// linear pixel position (0..15, row-major) to tile coordinates
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+// number of precision entries per channel (size of Chanpat::prec)
+#define	NDELTA	2
+
+// Bit widths used for one colour channel of a mode.
+// prec[0] is the width of endpoint A and prec[1] the width of endpoint B (or of its delta from A
+// when the pattern is transformed), as used by compress_endpts()/decompress_endpts().
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel
+};
+
+// Description of one block mode: per-channel precisions, whether endpoint B is delta-coded,
+// the mode value written to the block, and the bit layout of the header.
+// Member order matters: the patterns[] table initialises this struct positionally.
+struct Pattern
+{
+    Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;				// associated mode value
+    int modebits;			// number of mode bits
+    const char *encoding;	// verilog description of encoding for this mode; write_header()/read_header() parse it from the end of the string backwards
+};
+
+// widest mode field in bits, and the resulting number of distinct mode values (size of mode_to_pat)
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+
+// number of entries in the patterns table
+#define	NPATTERNS 4
+
+// The one-region modes, ordered from highest to lowest endpoint-A precision (refineone() tries them in this order).
+// Each row is: { per-channel {A bits, B/delta bits} for r, g, b }, transformed, mode value, mode bits, encoding.
+static const Pattern patterns[NPATTERNS] =
+{
+    { { {{16,4}},  {{16,4}},  {{16,4}}  }, 1, 0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]" },
+    { { {{12,8}},  {{12,8}},  {{12,8}}  }, 1, 0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]" },
+    { { {{11,9}},  {{11,9}},  {{11,9}}  }, 1, 0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]" },
+    { { {{10,10}}, {{10,10}}, {{10,10}} }, 0, 0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]" }
+};
+
+// lookup from a mode value to the index of its entry in patterns[]; -1 marks a mode with no entry there
+static const int mode_to_pat[MAXMODES] = {
+    -1, -1, -1,  3,                     // 0x00..0x03  (0x03 -> patterns[3])
+    -1, -1, -1,  2,                     // 0x04..0x07  (0x07 -> patterns[2])
+    -1, -1, -1,  1,                     // 0x08..0x0b  (0x0b -> patterns[1])
+    -1, -1, -1,  0,                     // 0x0c..0x0f  (0x0f -> patterns[0])
+    -1, -1, -1, -1, -1, -1, -1, -1,     // 0x10..0x17
+    -1, -1, -1, -1, -1, -1, -1, -1      // 0x18..0x1f
+};
+
+// Shorthand for channel i of endpoint A / endpoint B of region 0.
+// Both macros pick up a variable named i from the scope where they are expanded.
+#define	R_0(ep)	(ep)[0].A[i]
+#define	R_1(ep)	(ep)[0].B[i]
+// mask with the low n bits set
+#define	MASK(n)	((1<<(n))-1)
+
+// Pack the region-0 endpoints into the field widths of pattern p.
+// Endpoint A is masked to prec[0] bits. Endpoint B is stored as its difference from A when the
+// pattern is transformed, and as itself otherwise; either way it is masked to prec[1] bits.
+static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    const bool use_delta = (p.transformed != 0);
+
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        const Chanpat &widths = p.chan[ch];
+
+        out[0].A[ch] = in[0].A[ch] & MASK(widths.prec[0]);
+
+        if (use_delta)
+            out[0].B[ch] = (in[0].B[ch] - in[0].A[ch]) & MASK(widths.prec[1]);
+        else
+            out[0].B[ch] = in[0].B[ch] & MASK(widths.prec[1]);
+    }
+}
+
+// Undo compress_endpts() for region 0.
+// Endpoint A is taken as stored. For a transformed pattern endpoint B is rebuilt as
+// (sign-extended delta + stored A) wrapped to prec[0] bits; otherwise it is taken as stored.
+// With the signed format every resulting value is sign-extended from its field width.
+static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    const bool is_signed = (Utils::FORMAT == SIGNED_F16);
+
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        const int prec_a = p.chan[ch].prec[0];
+        const int prec_b = p.chan[ch].prec[1];
+
+        // endpoint A is handled the same way in both layouts
+        if (is_signed)
+            out[0].A[ch] = SIGN_EXTEND(in[0].A[ch], prec_a);
+        else
+            out[0].A[ch] = in[0].A[ch];
+
+        if (p.transformed)
+        {
+            // B was stored as a signed delta relative to A
+            int sum = SIGN_EXTEND(in[0].B[ch], prec_b);
+            sum = (sum + in[0].A[ch]) & MASK(prec_a);
+
+            if (is_signed)
+                out[0].B[ch] = SIGN_EXTEND(sum, prec_a);
+            else
+                out[0].B[ch] = sum;
+        }
+        else
+        {
+            if (is_signed)
+                out[0].B[ch] = SIGN_EXTEND(in[0].B[ch], prec_b);
+            else
+                out[0].B[ch] = in[0].B[ch];
+        }
+    }
+}
+
+// Quantize every component of the float endpoints of each region to prec bits.
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
+{
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+    {
+        const FltEndpts &src = endpts[r];
+        IntEndpts &dst = q_endpts[r];
+
+        // endpoint A: x, y, z go to channels 0, 1, 2
+        dst.A[0] = Utils::quantize(src.A.x, prec);
+        dst.A[1] = Utils::quantize(src.A.y, prec);
+        dst.A[2] = Utils::quantize(src.A.z, prec);
+
+        // endpoint B
+        dst.B[0] = Utils::quantize(src.B.x, prec);
+        dst.B[1] = Utils::quantize(src.B.y, prec);
+        dst.B[2] = Utils::quantize(src.B.z, prec);
+    }
+}
+
+// Swap a region's endpoints as needed so that the index stored at the region's anchor position has a 0 high-order bit.
+// When a swap happens every index of that region is mirrored (i -> NINDICES-1-i) so the pixels keep their colours.
+// Positions run from 0 at x=0 y=0 to 15 at x=3 y=3, so y = (pos >> 2) & 3 and x = pos & 3.
+static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    int anchor_pos[NREGIONS_ONE];
+    anchor_pos[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        const int ax = anchor_pos[region] & 3;
+        const int ay = (anchor_pos[region] >> 2) & 3;
+        nvDebugCheck(REGION(ax,ay,shapeindex) == region);		// double check the table
+
+        // nothing to do when the high bit is already clear
+        if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+            continue;
+
+        // exchange endpoint A and endpoint B of this region
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const int held = endpts[region].A[ch];
+            endpts[region].A[ch] = endpts[region].B[ch];
+            endpts[region].B[ch] = held;
+        }
+
+        // mirror the indices of every pixel in the region
+        for (int py = 0; py < Tile::TILE_H; py++)
+        {
+            for (int px = 0; px < Tile::TILE_W; px++)
+            {
+                if (REGION(px,py,shapeindex) == region)
+                    indices[py][px] = NINDICES - 1 - indices[py][px];
+            }
+        }
+    }
+}
+
+// The endpoints fit a pattern only when compressing them was lossless:
+// decompress the packed values and require an exact match with the originals.
+static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
+{
+    IntEndpts roundtrip[NREGIONS_ONE];
+    decompress_endpts(compressed, roundtrip, p);
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            if (orig[region].A[ch] != roundtrip[region].A[ch] ||
+                orig[region].B[ch] != roundtrip[region].B[ch])
+                return false;
+        }
+    }
+    return true;
+}
+
+// Write the mode and the packed region-0 endpoints to the bit stream, following the pattern's
+// encoding string. The string is walked from its end towards its start, one field per step.
+static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
+{
+    const ComprEndpts &e = endpts[0];
+    const int m  = p.mode;
+    const int rw = e.A[0], rx = e.B[0];
+    const int gw = e.A[1], gx = e.B[1];
+    const int bw = e.A[2], bx = e.B[2];
+
+    for (int ptr = int(strlen(p.encoding)); ptr != 0; )
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        // pick the value the field refers to
+        int value;
+        switch (field)
+        {
+        case FIELD_M:   value = m;  break;
+        case FIELD_RW:  value = rw; break;
+        case FIELD_RX:  value = rx; break;
+        case FIELD_GW:  value = gw; break;
+        case FIELD_GX:  value = gx; break;
+        case FIELD_BW:  value = bw; break;
+        case FIELD_BX:  value = bx; break;
+
+        default:
+            // FIELD_D and the y/z endpoint fields are not expected in a one-region encoding
+            nvUnreachable();
+            continue;
+        }
+
+        out.write(value >> endbit, len);
+    }
+}
+
+// Read the mode and the packed region-0 endpoints from the bit stream.
+// On return p is a copy of the pattern selected by the mode and endpts[0] holds the raw field values.
+// NOTE(review): mode_to_pat yields -1 for modes without a pattern and that is only caught by
+// nvDebugCheck -- confirm callers never pass such a block here.
+static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode.
+    // the first 2 bits decide: values 0 and 1 are complete modes, anything else has 3 more mode bits
+    int mode = in.read(2);
+    if (!(mode == 0x00 || mode == 0x01))
+        mode = (in.read(3) << 2) | mode;
+
+    const int pat_index = mode_to_pat[mode];
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    // field accumulators; a field may arrive in several pieces that are OR-ed together
+    int rw = 0, rx = 0;
+    int gw = 0, gx = 0;
+    int bw = 0, bx = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        int *target = 0;
+        switch (field)
+        {
+        case FIELD_M:   continue;   // already processed so ignore
+        case FIELD_RW:  target = &rw; break;
+        case FIELD_RX:  target = &rx; break;
+        case FIELD_GW:  target = &gw; break;
+        case FIELD_GX:  target = &gx; break;
+        case FIELD_BW:  target = &bw; break;
+        case FIELD_BX:  target = &bx; break;
+
+        default:
+            // FIELD_D and the y/z endpoint fields are not expected in a one-region encoding
+            nvUnreachable();
+            continue;
+        }
+
+        *target |= in.read(len) << endbit;
+    }
+
+    nvDebugCheck (in.getptr() == 128 - 63);
+
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx;
+}
+
+// Write the per-pixel indices in position order.
+// The index at position 0 is written with one bit fewer than the others (its high bit is dropped).
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        const int nbits = (pos == 0) ? (INDEXBITS - 1) : INDEXBITS;
+        out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], nbits);
+    }
+}
+
+// Serialize one compressed block: header (mode and endpoints) followed by the indices.
+static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits stream(block, ZOH::BITSIZE);
+
+    write_header(endpts, p, stream);
+    write_indices(indices, shapeindex, stream);
+
+    // header plus indices must fill the block exactly
+    nvDebugCheck(stream.getptr() == ZOH::BITSIZE);
+}
+
+// Build the NINDICES-entry palette for one pair of quantized endpoints.
+// Each channel is unquantized, interpolated at i/DENOM, and passed through finish_unquantize.
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints, one (a, b) pair per channel -- really need a IntVector3...
+    const int ra = Utils::unquantize(endpts.A[0], prec);
+    const int rb = Utils::unquantize(endpts.B[0], prec);
+    const int ga = Utils::unquantize(endpts.A[1], prec);
+    const int gb = Utils::unquantize(endpts.B[1], prec);
+    const int ba = Utils::unquantize(endpts.A[2], prec);
+    const int bb = Utils::unquantize(endpts.B[2], prec);
+
+    // interpolate all three channels of each palette entry
+    for (int i = 0; i < NINDICES; ++i)
+    {
+        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(ra, rb, i, DENOM), prec));
+        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(ga, gb, i, DENOM), prec));
+        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(ba, bb, i, DENOM), prec));
+    }
+}
+
+// Read the per-pixel indices in position order.
+// Position 0 was compressed: it is stored with one bit fewer than the other positions.
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        const int nbits = (pos == 0) ? (INDEXBITS - 1) : INDEXBITS;
+        indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(nbits);
+    }
+}
+
+// Decode a one-region block into tile t: read the header, rebuild the endpoints, build the palette,
+// read the indices and look every pixel up in the palette.
+void ZOH::decompressone(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    // header -> packed endpoints and the pattern in use
+    Pattern pat;
+    ComprEndpts packed[NREGIONS_ONE];
+    read_header(in, packed, pat);
+
+    const int shapeindex = 0;		// only one shape
+
+    IntEndpts unpacked[NREGIONS_ONE];
+    decompress_endpts(packed, unpacked, pat);
+
+    // one palette per region, all at the precision of the pattern's first channel
+    const int prec = pat.chan[0].prec[0];
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+        generate_palette_quantized(unpacked[region], prec, palette[region]);
+
+    // read indices
+    int indices[Tile::TILE_H][Tile::TILE_W];
+    read_indices(in, shapeindex, indices);
+
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup
+    for (int y = 0; y < Tile::TILE_H; y++)
+    {
+        for (int x = 0; x < Tile::TILE_W; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+            t.data[y][x] = palette[region][indices[y][x]];
+        }
+    }
+}
+
+// Given np colors (with per-color importance) and quantized endpoints, build the palette, pick the
+// best entry for every color and return the summed, importance-weighted error.
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    generate_palette_quantized(endpts, prec, palette);
+
+    float total = 0;
+
+    for (int c = 0; c < np; ++c)
+    {
+        float best = Utils::norm(colors[c], palette[0]) * importance[c];
+
+        // walk the palette until the error is zero or starts to rise
+        for (int k = 1; k < NINDICES && best > 0; ++k)
+        {
+            const float candidate = Utils::norm(colors[c], palette[k]) * importance[c];
+
+            if (candidate > best)	// error increased, so we're done searching
+                break;
+            if (candidate < best)
+                best = candidate;
+        }
+        total += best;
+    }
+    return total;
+}
+
+// Assign a palette index to every pixel of the tile given a shape and quantized endpoints.
+// indices receives the chosen index per pixel; toterr receives the summed error per region.
+// NOTE(review): unlike map_colors(), the error here is not weighted by the importance map -- confirm this is intended.
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+    {
+        generate_palette_quantized(endpts[r], prec, palette[r]);
+        toterr[r] = 0;
+    }
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+
+            // start from entry 0 and keep going while the error does not rise
+            float best = Utils::norm(tile.data[y][x], palette[region][0]);
+            int best_index = 0;
+
+            for (int k = 1; k < NINDICES && best > 0; ++k)
+            {
+                const float candidate = Utils::norm(tile.data[y][x], palette[region][k]);
+
+                if (candidate > best)	// error increased, so we're done searching
+                    break;
+                if (candidate < best)
+                {
+                    best = candidate;
+                    best_index = k;
+                }
+            }
+
+            indices[y][x] = best_index;
+            toterr[region] += best;
+        }
+    }
+}
+
+// Search for a better value of a single endpoint component: channel ch of endpoint A when do_b is 0,
+// of endpoint B otherwise. new_endpts starts as a copy of old_endpts and ends up holding the best
+// component value found; the return value is the matching error (old_err when nothing improved).
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // trial is the candidate being scored; new_endpts is the best accepted so far
+    IntEndpts trial;
+    for (int c = 0; c < NCHANNELS; ++c)
+    {
+        trial.A[c] = new_endpts.A[c] = old_endpts.A[c];
+        trial.B[c] = new_endpts.B[c] = old_endpts.B[c];
+    }
+
+    const int limit = 1 << prec;	// candidates must stay inside [0, limit)
+    float best_err = old_err;		// start with the best current error
+    int best_delta = 0;
+
+    // logarithmic search: try +/- step around the current best, halving the step every round
+    for (int step = 1 << (prec-1); step != 0; step >>= 1)
+    {
+        bool improved = false;
+
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            const int delta = sign * step;
+            bool in_range;
+
+            if (do_b == 0)
+            {
+                trial.A[ch] = new_endpts.A[ch] + delta;
+                in_range = (trial.A[ch] >= 0 && trial.A[ch] < limit);
+            }
+            else
+            {
+                trial.B[ch] = new_endpts.B[ch] + delta;
+                in_range = (trial.B[ch] >= 0 && trial.B[ch] < limit);
+            }
+
+            if (!in_range)
+                continue;
+
+            const float err = map_colors(colors, importance, np, trial, prec);
+            if (err < best_err)
+            {
+                improved = true;
+                best_err = err;
+                best_delta = delta;
+            }
+        }
+
+        // if this was an improvement, move the endpoint and continue the search from there
+        if (!improved)
+            continue;
+
+        if (do_b == 0)
+            new_endpts.A[ch] += best_delta;
+        else
+            new_endpts.B[ch] += best_delta;
+    }
+    return best_err;
+}
+
+// Optimize the endpoints of one region, one channel at a time.
+// For each channel:
+//   - perturb endpoint A and endpoint B separately and keep whichever gives the lower error,
+//     provided it beats the current error (otherwise leave the channel alone);
+//   - then alternate between the two endpoints, perturbing the one not moved last, until a
+//     perturbation no longer lowers the error.
+// opt_endpts starts as a copy of orig_endpts and receives the result.
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    for (int c = 0; c < NCHANNELS; ++c)
+    {
+        opt_endpts.A[c] = orig_endpts.A[c];
+        opt_endpts.B[c] = orig_endpts.B[c];
+    }
+
+    float opt_err = orig_err;
+    IntEndpts moved_a, moved_b, moved;
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minima
+        const float err_a = perturb_one(colors, importance, np, ch, prec, opt_endpts, moved_a, opt_err, 0);	// perturb endpt A
+        const float err_b = perturb_one(colors, importance, np, ch, prec, opt_endpts, moved_b, opt_err, 1);	// perturb endpt B
+
+        const bool a_wins = (err_a < err_b);
+        const float first_err = a_wins ? err_a : err_b;
+
+        // neither perturbation helped: nothing to do for this channel
+        if (first_err >= opt_err)
+            continue;
+
+        if (a_wins)
+            opt_endpts.A[ch] = moved_a.A[ch];
+        else
+            opt_endpts.B[ch] = moved_b.B[ch];
+        opt_err = first_err;
+
+        // the endpoint that was not just moved goes next
+        int do_b = a_wins ? 1 : 0;
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            const float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, moved, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+
+            if (do_b == 0)
+                opt_endpts.A[ch] = moved.A[ch];
+            else
+                opt_endpts.B[ch] = moved.B[ch];
+
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+// Optimize the endpoints of every region: gather the region's pixels and their importance
+// values from the tile, then hand them to optimize_one().
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE], 
+                            const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        // collect the pixels in the region
+        int count = 0;
+
+        for (int y = 0; y < tile.size_y; y++)
+        {
+            for (int x = 0; x < tile.size_x; x++)
+            {
+                if (REGION(x, y, shapeindex) != region)
+                    continue;
+
+                pixels[count] = tile.data[y][x];
+                importance[count] = tile.importance_map[y][x];
+                ++count;
+            }
+        }
+
+        optimize_one(pixels, importance, count, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+// Turn float endpoints into a finished block.
+// Patterns are tried in table order; the first one whose quantized endpoints survive compression
+// losslessly is used. For that pattern the endpoints are optimized, and the optimized version is
+// emitted only when it also fits and lowers the total error; otherwise the unoptimized one is emitted.
+// Returns the total error of the version that was emitted.
+float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
+{
+    float orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE];
+    IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
+    ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        const Pattern &pat = patterns[sp];
+        const int prec = pat.chan[0].prec[0];
+
+        // precisions for all channels need to be the same
+        for (int ch = 1; ch < NCHANNELS; ++ch) nvDebugCheck (prec == pat.chan[ch].prec[0]);
+
+        // unoptimized candidate
+        quantize_endpts(endpts, prec, orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, prec, orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, pat);
+
+        if (!endpts_fit(orig_endpts, compr_orig, pat))
+            continue;	// try the next pattern
+
+        // optimized candidate
+        optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, prec, opt_endpts);
+        assign_indices(tile, shapeindex_best, opt_endpts, prec, opt_indices, opt_err);
+        swap_indices(opt_endpts, opt_indices, shapeindex_best);
+        compress_endpts(opt_endpts, compr_opt, pat);
+
+        float orig_toterr = 0, opt_toterr = 0;
+        for (int r = 0; r < NREGIONS_ONE; ++r)
+        {
+            orig_toterr += orig_err[r];
+            opt_toterr += opt_err[r];
+        }
+
+        const bool use_optimized = endpts_fit(opt_endpts, compr_opt, pat) && opt_toterr < orig_toterr;
+        if (use_optimized)
+        {
+            emit_block(compr_opt, shapeindex_best, pat, opt_indices, block);
+            return opt_toterr;
+        }
+
+        // either it stopped fitting when we optimized it, or there was no improvement
+        // so go back to the unoptimized endpoints which we know will fit
+        emit_block(compr_orig, shapeindex_best, pat, orig_indices, block);
+        return orig_toterr;
+    }
+
+    nvAssert (false); // "No candidate found, should never happen (refineone.)";
+    return FLT_MAX;
+}
+
+// Build the palette of every region directly from float endpoints: entry i is lerp(A, B, i, DENOM).
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES])
+{
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+    {
+        const FltEndpts &e = endpts[r];
+
+        for (int k = 0; k < NINDICES; ++k)
+            palette[r][k] = Utils::lerp(e.A, e.B, k, DENOM);
+    }
+}
+
+// Generate a palette from unquantized endpoints, pick the best palette color for every pixel of the
+// tile, and return the importance-weighted error summed over all regions.
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+    generate_palette_unquantized(endpts, palette);
+
+    float total = 0;
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+
+            float best = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+            // walk the palette until the error is zero or starts to rise
+            for (int k = 1; k < NINDICES && best > 0; ++k)
+            {
+                const float candidate = Utils::norm(tile.data[y][x], palette[region][k]) * tile.importance_map[y][x];
+
+                if (candidate > best)	// error increased, so we're done searching
+                    break;
+                if (candidate < best)
+                    best = candidate;
+            }
+            total += best;
+        }
+    }
+    return total;
+}
+
+float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
+{
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        int np = 0;
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    colors[np] = tile.data[y][x];
+                    mean += tile.data[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        // handle simple cases
+        if (np == 0)
+        {
+            Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        else if (np == 1)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[0];
+            continue;
+        }
+        else if (np == 2)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[1];
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float minp = FLT_MAX, maxp = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            float dp = dot(colors[i]-mean, direction);
+            if (dp < minp) minp = dp;
+            if (dp > maxp) maxp = dp;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + minp*direction;
+        endpts[region].B = mean + maxp*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compressone(const Tile &t, char *block)
+{
+    int shapeindex_best = 0;
+    FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE];
+    float msebest = FLT_MAX;
+
+    /*
+		collect the mse values that are within 5% of the best values
+		optimize each one and choose the best
+	*/
+    // hack for now -- just use the best value WORK
+    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
+    {
+        float mse = roughone(t, i, tempendpts);
+        if (mse < msebest)
+        {
+            msebest = mse;
+            shapeindex_best = i;
+            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+        }
+
+    }
+    return refineone(t, shapeindex_best, endptsbest, block);
+}
--- a/src/bc6h/zohtwo.cpp
+++ b/src/bc6h/zohtwo.cpp
@ -0,0 +1,883 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// two regions zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+/* optimization algorithm
+
+	get initial float endpoints
+	convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
+		note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
+	for each EC candidate in order from max precision to smaller precision
+		convert endpoints using the appropriate precision.
+		optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
+			(thus the endpoints and indices are in final form.)
+		transform and get bit delta.
+		if the bit delta fits, exit
+	if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should happen hardly ever.
+		add a state variable to nvDebugCheck we only do this once.
+	convert to bit stream.
+	return the error.
+
+	Global optimization
+		order all tiles based on their errors
+		do something special for high-error tiles
+			the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
+
+	display an image that shows partitioning and precision selected for each tile
+*/
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Fitting.h"
+#include "nvmath/Vector.inl"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	8	// palette entries per region
+#define	INDEXBITS	3	// bits written per pixel index (one fewer at each region's compressed-index position)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// top bit of an index; swap_indices keeps it clear at each region's compressed-index position
+#define	DENOM		(NINDICES-1)	// denominator passed to Utils::lerp when building palettes
+
+// WORK: determine the optimal traversal pattern to search for the best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so that we can find the global error minimum easily and
+// stop without having to touch all shapes?
+
+#include "shapes_two.h"
+// use only the first 32 available shapes: replace the NSHAPES/SHAPEBITS values that were in effect after including shapes_two.h
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 32	// shapes tried by compresstwo
+#define SHAPEBITS 5	// matches the d[4:0] shape field in the mode encodings below
+
+#define	POS_TO_X(pos)	((pos)&3)	// x coordinate of linear pixel position pos (low two bits)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// y coordinate of linear pixel position pos (next two bits)
+
+#define	NDELTA	4	// values stored per channel: endpoints A and B of both regions (see R_0..R_3)
+
+struct Chanpat	// bit widths of the NDELTA values stored for one color channel
+{
+    int prec[NDELTA];		// precision pattern for one channel: prec[0] is the width of R_0 (region 0, endpoint A), prec[1..3] the widths of R_1..R_3
+};
+
+struct Pattern	// description of one two-region mode: field widths, mode id and header bit layout
+{
+    Chanpat chan[NCHANNELS];    // allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;            // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;                   // associated mode value (written as the m field of the header)
+    int modebits;               // number of mode bits
+    const char *encoding;       // verilog description of encoding for this mode; walked from the end of the string backwards by write_header/read_header
+};
+
+#define MAXMODEBITS	5	// widest mode field (modebits is 2 or 5 in the table below)
+#define	MAXMODES (1<<MAXMODEBITS)	// number of entries in mode_to_pat
+
+#define	NPATTERNS 10	// number of entries in patterns[]
+
+static const Pattern patterns[NPATTERNS] =	// columns: R prec[0..3], G prec[0..3], B prec[0..3], transformed, mode, modebits, encoding; rows run from the widest prec[0] down to the narrowest, the order refinetwo tries them in
+{
+    11,5,5,5,	11,4,4,4,	11,4,4,4,	1,	0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,5,5,5,	11,4,4,4,	1,	0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,4,4,4,	11,5,5,5,	1,	0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,5,5,5,	10,5,5,5,	10,5,5,5,	1,	0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
+    9,5,5,5,	9,5,5,5,	9,5,5,5,	1,	0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
+    8,6,6,6,	8,5,5,5,	8,5,5,5,	1,	0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
+    8,5,5,5,	8,6,6,6,	8,5,5,5,	1,	0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
+    8,5,5,5,	8,5,5,5,	8,6,6,6,	1,	0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
+    7,6,6,6,	7,6,6,6,	7,6,6,6,	1,	0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
+    6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in patterns[]; -1 marks mode values that have no entry in patterns[]
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- return -2 for these
+static const int mode_to_pat[MAXMODES] = {	
+    3,	// 0x00
+    8,	// 0x01
+    0,	// 0x02
+    -1,-1,-1,
+    1,	// 0x06
+    -1,-1,-1,
+    2,	// 0x0a
+    -1,-1,-1,
+    4,	// 0x0e
+    -1,-1,-1,
+    5,	// 0x12
+    -2,-1,-1,
+    6,	// 0x16
+    -2,-1,-1,
+    7,	// 0x1a
+    -2,-1,-1,
+    9,	// 0x1e
+    -2
+};
+
+#define	R_0(ep)	(ep)[0].A[i]	// region 0, endpoint A, channel i -- a variable named i must be in scope where the macro is used
+#define	R_1(ep)	(ep)[0].B[i]	// region 0, endpoint B, channel i
+#define	R_2(ep)	(ep)[1].A[i]	// region 1, endpoint A, channel i
+#define	R_3(ep)	(ep)[1].B[i]	// region 1, endpoint B, channel i
+#define	MASK(n)	((1<<(n))-1)	// value with the low n bits set
+
+// Pack quantized endpoints into the field widths of pattern p.
+// For a transformed pattern the three non-base values (R_1..R_3) are stored as
+// differences from the base value R_0; otherwise they are stored directly.
+// Every stored value is masked to its field width.
+static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    for (int i=0; i<NCHANNELS; ++i)     // the R_n macros pick up channel i
+    {
+        const int *width = p.chan[i].prec;
+
+        // the base is subtracted only when the pattern stores deltas
+        const int base = p.transformed ? R_0(in) : 0;
+
+        R_0(out) = R_0(in) & MASK(width[0]);
+        R_1(out) = (R_1(in) - base) & MASK(width[1]);
+        R_2(out) = (R_2(in) - base) & MASK(width[2]);
+        R_3(out) = (R_3(in) - base) & MASK(width[3]);
+    }
+}
+
+// decompress endpoints: undo compress_endpts for pattern p (endpts_fit compares the result against the original endpoints)
+static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    bool issigned = Utils::FORMAT == SIGNED_F16;	// outputs are sign-extended only for the signed format
+
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);	// the base value is stored directly
+            int t;
+            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);	// the stored delta is sign-extended from its own field width
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);	// add the base and wrap to the base precision
+            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);	// same steps for region 1, endpoint A
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);	// same steps for region 1, endpoint B
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)	// untransformed: each value is stored directly at its own precision
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+            R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
+            R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
+        }
+    }
+}
+
+// Quantize the x/y/z components of both endpoints of every region with
+// Utils::quantize at precision prec, writing channels 0/1/2 of q_endpts.
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
+{
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+    {
+        const FltEndpts &src = endpts[r];
+        IntEndpts &dst = q_endpts[r];
+
+        dst.A[0] = Utils::quantize(src.A.x, prec);
+        dst.A[1] = Utils::quantize(src.A.y, prec);
+        dst.A[2] = Utils::quantize(src.A.z, prec);
+
+        dst.B[0] = Utils::quantize(src.B.x, prec);
+        dst.B[1] = Utils::quantize(src.B.y, prec);
+        dst.B[2] = Utils::quantize(src.B.z, prec);
+    }
+}
+
+// For each region, make sure the index stored at the region's compressed-index
+// position has a 0 high-order bit. When it does not, the region's A and B
+// endpoints are exchanged and every index of the region is mirrored
+// (index -> NINDICES - 1 - index).
+static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+        const int ax = POS_TO_X(anchor);
+        const int ay = POS_TO_Y(anchor);
+
+        nvDebugCheck(REGION(ax,ay,shapeindex) == region);		// double check the table
+
+        if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+            continue;       // high bit already clear, nothing to do for this region
+
+        // high bit is set: exchange the two endpoints channel by channel
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const int saved = endpts[region].A[ch];
+            endpts[region].A[ch] = endpts[region].B[ch];
+            endpts[region].B[ch] = saved;
+        }
+
+        // ...and mirror every index that belongs to this region
+        for (int row = 0; row < Tile::TILE_H; row++)
+        {
+            for (int col = 0; col < Tile::TILE_W; col++)
+            {
+                if (REGION(col,row,shapeindex) == region)
+                    indices[row][col] = NINDICES - 1 - indices[row][col];
+            }
+        }
+    }
+}
+
+// Endpoints fit pattern p only if the compression was lossless: decompressing
+// the compressed values must reproduce every original endpoint value.
+static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
+{
+    IntEndpts roundtrip[NREGIONS_TWO];
+    decompress_endpts(compressed, roundtrip, p);
+
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const bool same = orig[region].A[ch] == roundtrip[region].A[ch]
+                           && orig[region].B[ch] == roundtrip[region].B[ch];
+            if (!same)
+                return false;
+        }
+    }
+
+    return true;
+}
+
+// Write the block header -- mode, shape index and compressed endpoints -- to out,
+// following the bit layout given by p.encoding.
+static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
+{
+    // field values, named as in the encoding strings:
+    // w/x are endpoints A/B of region 0, y/z are endpoints A/B of region 1
+    const int m = p.mode;
+    const int d = shapeindex;
+    const int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];
+    const int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];
+    const int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];
+
+    // interpret the verilog backwards and process it
+    for (int ptr = int(strlen(p.encoding)); ptr; )
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        // pick the value the parsed field refers to
+        int value;
+        switch(field)
+        {
+        case FIELD_M:	value = m; break;
+        case FIELD_D:	value = d; break;
+        case FIELD_RW:	value = rw; break;
+        case FIELD_RX:	value = rx; break;
+        case FIELD_RY:	value = ry; break;
+        case FIELD_RZ:	value = rz; break;
+        case FIELD_GW:	value = gw; break;
+        case FIELD_GX:	value = gx; break;
+        case FIELD_GY:	value = gy; break;
+        case FIELD_GZ:	value = gz; break;
+        case FIELD_BW:	value = bw; break;
+        case FIELD_BX:	value = bx; break;
+        case FIELD_BY:	value = by; break;
+        case FIELD_BZ:	value = bz; break;
+        default: nvUnreachable(); continue;     // nothing is written for an unknown field
+        }
+
+        // emit len bits of the value starting at bit endbit
+        out.write(value >> endbit, len);
+    }
+}
+
+// Read the block header from in: decode the mode, look up its pattern, then read
+// the shape index and compressed endpoints in the order given by the pattern's
+// encoding string.
+// Returns false, leaving endpts, shapeindex and p untouched, when the mode has
+// no usable pattern: either a reserved mode (-2 in mode_to_pat) or a mode value
+// with no entry in patterns[] (-1 in mode_to_pat).
+static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;
+
+    int pat_index = mode_to_pat[mode];
+
+    if (pat_index == -2)
+        return false;		// reserved mode found
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+
+    // in case the debug check above is compiled out: never index patterns[] out of range
+    if (pat_index < 0 || pat_index >= NPATTERNS)
+        return false;
+
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    // fields are assembled piecewise, so start them all at zero
+    int d = 0;
+    int rw = 0, rx = 0, ry = 0, rz = 0;
+    int gw = 0, gx = 0, gy = 0, gz = 0;
+    int bw = 0, bx = 0, by = 0, bz = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        // each piece contributes len bits placed at bit position endbit of its field
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_D:	 d |= in.read(len) << endbit; break;
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_RY:	ry |= in.read(len) << endbit; break;
+        case FIELD_RZ:	rz |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_GY:	gy |= in.read(len) << endbit; break;
+        case FIELD_GZ:	gz |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+        case FIELD_BY:	by |= in.read(len) << endbit; break;
+        case FIELD_BZ:	bz |= in.read(len) << endbit; break;
+        default: nvUnreachable();
+        }
+    }
+
+    // everything before the 46 bits of indices should have been consumed
+    nvDebugCheck (in.getptr() == 128 - 46);
+
+    shapeindex = d;
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;
+
+    return true;
+}
+
+// Write the per-pixel indices in linear position order. The pixel at each
+// region's compressed-index position is written with INDEXBITS-1 bits; every
+// other pixel is written with INDEXBITS bits.
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    int anchors[NREGIONS_TWO];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // one bit less when this position is the compressed-index position of some region
+        int nbits = INDEXBITS;
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+        {
+            if (anchors[r] == pos)
+            {
+                nbits = INDEXBITS - 1;
+                break;
+            }
+        }
+
+        out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], nbits);
+    }
+}
+
+// Serialize one compressed block into block: the header first, then the indices.
+// Together they are expected to fill exactly ZOH::BITSIZE bits (checked in debug builds).
+static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits stream(block, ZOH::BITSIZE);
+
+    write_header(compr_endpts, shapeindex, p, stream);
+    write_indices(indices, shapeindex, stream);
+
+    nvDebugCheck(stream.getptr() == ZOH::BITSIZE);
+}
+
+// Build the NINDICES-entry palette for one region from its quantized endpoints.
+// Per channel: unquantize both endpoints, interpolate with Utils::lerp for each
+// index, and pass the result through Utils::finish_unquantize.
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // really need a IntVector3... so the channels are handled one at a time
+
+    // channel 0 -> x
+    {
+        const int lo = Utils::unquantize(endpts.A[0], prec);
+        const int hi = Utils::unquantize(endpts.B[0], prec);
+        for (int k = 0; k < NINDICES; ++k)
+            palette[k].x = float(Utils::finish_unquantize(Utils::lerp(lo, hi, k, DENOM), prec));
+    }
+
+    // channel 1 -> y
+    {
+        const int lo = Utils::unquantize(endpts.A[1], prec);
+        const int hi = Utils::unquantize(endpts.B[1], prec);
+        for (int k = 0; k < NINDICES; ++k)
+            palette[k].y = float(Utils::finish_unquantize(Utils::lerp(lo, hi, k, DENOM), prec));
+    }
+
+    // channel 2 -> z
+    {
+        const int lo = Utils::unquantize(endpts.A[2], prec);
+        const int hi = Utils::unquantize(endpts.B[2], prec);
+        for (int k = 0; k < NINDICES; ++k)
+            palette[k].z = float(Utils::finish_unquantize(Utils::lerp(lo, hi, k, DENOM), prec));
+    }
+}
+
+// Read the per-pixel indices in linear position order. The pixel at each
+// region's compressed-index position is read with INDEXBITS-1 bits; every other
+// pixel is read with INDEXBITS bits.
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    int anchors[NREGIONS_TWO];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        anchors[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // one bit less when this position is the compressed-index position of some region
+        int nbits = INDEXBITS;
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+        {
+            if (anchors[r] == pos)
+            {
+                nbits = INDEXBITS - 1;
+                break;
+            }
+        }
+
+        indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(nbits);
+    }
+}
+
+// Decode a two-region block into tile t: read the header, rebuild the endpoints
+// and per-region palettes, read the indices, then look up every pixel.
+// A block whose header is rejected by read_header decodes to all zeroes.
+void ZOH::decompresstwo(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    Pattern pat;
+    ComprEndpts packed[NREGIONS_TWO];
+    int shapeindex;
+
+    if (!read_header(in, packed, shapeindex, pat))
+    {
+        // reserved mode, return all zeroes
+        for (int row = 0; row < Tile::TILE_H; row++)
+        {
+            for (int col = 0; col < Tile::TILE_W; col++)
+                t.data[row][col] = Vector3(0.0f);
+        }
+        return;
+    }
+
+    IntEndpts endpts[NREGIONS_TWO];
+    decompress_endpts(packed, endpts, pat);
+
+    // one palette per region, built at the pattern's base precision
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+        generate_palette_quantized(endpts[region], pat.chan[0].prec[0], &palette[region][0]);
+
+    int indices[Tile::TILE_H][Tile::TILE_W];
+    read_indices(in, shapeindex, indices);
+
+    // the whole block should have been consumed
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup
+    for (int row = 0; row < Tile::TILE_H; row++)
+    {
+        for (int col = 0; col < Tile::TILE_W; col++)
+            t.data[row][col] = palette[REGION(col,row,shapeindex)][indices[row][col]];
+    }
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// Each color's error is Utils::norm to its chosen palette entry multiplied by the
+// color's importance; toterr is the sum over all np colors.
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    for (int i = 0; i < np; ++i)
+    {
+        float besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        // walk the palette in order, stopping at an exact match or as soon as the error grows
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            const float err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+// Only pixels inside tile.size_x by tile.size_y are visited; other entries of
+// indices are left untouched. The error here is the plain Utils::norm and is not
+// scaled by the tile's importance map.
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            int region = REGION(x,y,shapeindex);
+
+            float besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+            indices[y][x] = 0;
+
+            // walk the palette in order, stopping at an exact match or as soon as the error grows
+            for (int i = 1; i < NINDICES && besterr > 0; ++i)
+            {
+                const float err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+                if (err > besterr)	// error increased, so we're done searching
+                    break;
+                if (err < besterr)
+                {
+                    besterr = err;
+                    indices[y][x] = i;
+                }
+            }
+            toterr[region] += besterr;
+        }
+    }
+}
+
+// Search for a better value of one endpoint component.
+//   colors, importance, np : the region's pixels and their weights
+//   ch                     : channel to perturb
+//   prec                   : endpoint precision in bits; candidates outside [0, 1<<prec) are skipped
+//   old_endpts             : starting endpoints
+//   new_endpts             : receives old_endpts with the perturbed component moved to its best found value
+//   old_err                : error of old_endpts; only strictly smaller errors count as improvements
+//   do_b                   : 0 perturbs endpoint A, anything else perturbs endpoint B
+// Returns the best error found (old_err when nothing improved).
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts
+    // we have the temporary endpoints: temp_endpts
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep = 0;			// only read after an improvement has set it; initialized so it is never indeterminate
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint (selected by do_b)
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            // candidate = current best position +/- step; skip it when out of range
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+// Optimize the endpoints of one region, one channel at a time, starting from
+// orig_endpts (whose error is orig_err). The result is written to opt_endpts.
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    // begin with the unmodified endpoints and their error
+    float best_err = orig_err;
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+
+    /*
+        Outline of the per-channel search:
+
+        err0 = perturb(rgb0, delta0)
+        err1 = perturb(rgb1, delta1)
+        if (err0 < err1)
+            if (err0 >= initial_error) break
+            rgb0 += delta0
+            next = 1
+        else
+            if (err1 >= initial_error) break
+            rgb1 += delta1
+            next = 0
+        initial_err = map()
+        for (;;)
+            err = perturb(next ? rgb1:rgb0, delta)
+            if (err >= initial_err) break
+            next? rgb1 : rgb0 += delta
+            initial_err = err
+    */
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minimum
+        IntEndpts trial_a, trial_b;
+        const float err_a = perturb_one(colors, importance, np, ch, prec, opt_endpts, trial_a, best_err, 0);	// perturb endpt A
+        const float err_b = perturb_one(colors, importance, np, ch, prec, opt_endpts, trial_b, best_err, 1);	// perturb endpt B
+
+        const bool a_first = err_a < err_b;
+        const float first_err = a_first ? err_a : err_b;
+
+        // the better of the two did not beat the current error: leave this channel alone
+        if (first_err >= best_err)
+            continue;
+
+        int do_b;
+        if (a_first)
+        {
+            opt_endpts.A[ch] = trial_a.A[ch];
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            opt_endpts.B[ch] = trial_b.B[ch];
+            do_b = 0;		// do A next
+        }
+        best_err = first_err;
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            IntEndpts trial;
+            const float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, trial, best_err, do_b);
+            if (err >= best_err)
+                break;
+
+            if (do_b == 0)
+                opt_endpts.A[ch] = trial.A[ch];
+            else
+                opt_endpts.B[ch] = trial.B[ch];
+
+            best_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+// Optimize the endpoints of each region independently: gather the region's
+// pixels and importance weights (within tile.size_x by tile.size_y), then let
+// optimize_one search starting from orig_endpts[region] / orig_err[region].
+// Results are written to opt_endpts.
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO], 
+                            const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        // collect the pixels in the region
+        int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++)
+        {
+            for (int x = 0; x < tile.size_x; x++)
+            {
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+// Turn float endpoints for the chosen shape into an encoded block.
+// Patterns are tried in table order; the first one whose compressed endpoints
+// round-trip losslessly is used. For that pattern the endpoints are optimized,
+// and the optimized version is emitted only if it still fits and lowers the
+// total error; otherwise the unoptimized version is emitted.
+// Returns the total error of whatever was emitted (FLT_MAX if no pattern fit).
+float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
+{
+    float orig_err[NREGIONS_TWO];
+    float opt_err[NREGIONS_TWO];
+    IntEndpts orig_endpts[NREGIONS_TWO];
+    IntEndpts opt_endpts[NREGIONS_TWO];
+    ComprEndpts compr_orig[NREGIONS_TWO];
+    ComprEndpts compr_opt[NREGIONS_TWO];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W];
+    int opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        const Pattern &pat = patterns[sp];
+        const int prec = pat.chan[0].prec[0];
+
+        // precisions for all channels need to be the same
+        for (int ch = 1; ch < NCHANNELS; ++ch) nvDebugCheck (prec == pat.chan[ch].prec[0]);
+
+        // unoptimized candidate
+        quantize_endpts(endpts, prec, orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, prec, orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, pat);
+
+        if (!endpts_fit(orig_endpts, compr_orig, pat))
+            continue;       // does not fit this pattern, try the next one
+
+        // optimized candidate
+        optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, prec, opt_endpts);
+        assign_indices(tile, shapeindex_best, opt_endpts, prec, opt_indices, opt_err);
+        swap_indices(opt_endpts, opt_indices, shapeindex_best);
+        compress_endpts(opt_endpts, compr_opt, pat);
+
+        float orig_toterr = 0;
+        float opt_toterr = 0;
+        for (int region = 0; region < NREGIONS_TWO; ++region)
+        {
+            orig_toterr += orig_err[region];
+            opt_toterr += opt_err[region];
+        }
+
+        const bool use_optimized = endpts_fit(opt_endpts, compr_opt, pat) && opt_toterr < orig_toterr;
+        if (use_optimized)
+        {
+            emit_block(compr_opt, shapeindex_best, pat, opt_indices, block);
+            return opt_toterr;
+        }
+
+        // either it stopped fitting when we optimized it, or there was no improvement
+        // so go back to the unoptimized endpoints which we know will fit
+        emit_block(compr_orig, shapeindex_best, pat, orig_indices, block);
+        return orig_toterr;
+    }
+
+    nvAssert(false); //throw "No candidate found, should never happen (refinetwo.)";
+    return FLT_MAX;
+}
+
+// Build each region's NINDICES-entry palette by interpolating between its float
+// endpoints A and B with Utils::lerp (no quantization involved).
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
+{
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+    {
+        for (int k = 0; k < NINDICES; ++k)
+        {
+            palette[r][k] = Utils::lerp(endpts[r].A, endpts[r].B, k, DENOM);
+        }
+    }
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+// Each pixel's error is Utils::norm to its chosen palette entry multiplied by the
+// pixel's entry in tile.importance_map. Only pixels inside tile.size_x by
+// tile.size_y are visited.
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            int region = REGION(x,y,shapeindex);
+
+            float besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+            // walk the palette in order, stopping at an exact match or as soon as the error grows
+            for (int i = 1; i < NINDICES && besterr > 0; ++i)
+            {
+                const float err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+                if (err > besterr)	// error increased, so we're done searching
+                    break;
+                if (err < besterr)
+                    besterr = err;
+            }
+            toterr += besterr;
+        }
+    }
+    return toterr;
+}
+
+// Rough (unquantized) fit of a two-region shape: for each region choose float
+// endpoints that span the region's pixels along their principal direction, then
+// return the total error of mapping the tile onto the resulting palettes.
+float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
+{
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+        int np = 0;
+
+        // gather the pixels of this region and accumulate their sum
+        for (int y = 0; y < tile.size_y; y++)
+        {
+            for (int x = 0; x < tile.size_x; x++)
+            {
+                if (REGION(x,y,shapeindex) != region)
+                    continue;
+
+                colors[np] = tile.data[y][x];
+                mean += tile.data[y][x];
+                ++np;
+            }
+        }
+
+        // handle simple cases: with at most two pixels the endpoints are the pixels themselves
+        if (np <= 2)
+        {
+            if (np == 0)
+            {
+                Vector3 zero(0,0,0);
+                endpts[region].A = zero;
+                endpts[region].B = zero;
+            }
+            else
+            {
+                // np == 1 uses the same color twice, np == 2 uses both colors
+                endpts[region].A = colors[0];
+                endpts[region].B = colors[np - 1];
+            }
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float lowest = FLT_MAX;
+        float highest = -FLT_MAX;
+        for (int k = 0; k < np; k++)
+        {
+            const float proj = dot(colors[k]-mean, direction);
+            if (proj < lowest) lowest = proj;
+            if (proj > highest) highest = proj;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + lowest*direction;
+        endpts[region].B = mean + highest*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+// Compress tile t into block using the two-region encoder: run roughtwo on each
+// of the NSHAPES shapes, keep the shape with the lowest error, and hand that
+// shape and its endpoints to refinetwo. Returns whatever refinetwo returns.
+float ZOH::compresstwo(const Tile &t, char *block)
+{
+    FltEndpts best_endpts[NREGIONS_TWO];
+    FltEndpts trial_endpts[NREGIONS_TWO];
+    int best_shape = 0;
+    float best_mse = FLT_MAX;
+
+    // Intended approach: collect the mse values that are within 5% of the best,
+    // optimize each one and choose the best.
+    // hack for now -- just use the best value WORK
+    for (int shape = 0; shape < NSHAPES; ++shape)
+    {
+        // stop searching once the best error is no longer positive
+        if (!(best_mse > 0.0))
+            break;
+
+        const float trial_mse = roughtwo(t, shape, trial_endpts);
+        if (!(trial_mse < best_mse))
+            continue;
+
+        best_mse = trial_mse;
+        best_shape = shape;
+        memcpy(best_endpts, trial_endpts, sizeof(best_endpts));
+    }
+
+    return refinetwo(t, best_shape, best_endpts, block);
+}
+