// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
// 
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
// 
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
// 
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.

#include <nvmath/Color.h>
#include <nvimage/ColorBlock.h>
#include <nvimage/BlockDXT.h>

#include "FastCompressDXT.h"

#if defined(__SSE2__)
#include <emmintrin.h>
#endif

#if defined(__SSE__)
#include <xmmintrin.h>
#endif

#if defined(__MMX__)
#include <mmintrin.h>
#endif

#undef __VEC__
#if defined(__VEC__)
#include <altivec.h>
#undef bool
#endif
// Online Resources:
// - http://www.jasondorie.com/ImageLib.zip
// - http://homepage.hispeed.ch/rscheidegger/dri_experimental/s3tc_index.html
// - http://www.sjbrown.co.uk/?article=dxt

using namespace nv;


#if defined(__SSE2__) && 0

// @@ TODO
// SSE2 flavor of the VectorColor helpers. The "&& 0" in the enclosing #if keeps
// this branch from being compiled, and the bodies below are unfinished
// placeholders ("return ...;" is not valid C++), so they have to be written
// before the branch can be enabled.

typedef __m128i VectorColor;

// Placeholder: presumably meant to load a Color32 into an SSE2 register.
inline static __m128i loadColor(Color32 c)
{
	return ...;
}

// Placeholder: presumably meant to return the per-component |a - b|.
inline static __m128i absoluteDifference(__m128i a, __m128i b)
{
	return ...;
}
	
// Placeholder: always reports a distance of 0.
inline uint colorDistance(__m128i a, __m128i b)
{
	return 0;
}

#elif defined(__MMX__) && 0

// MMX flavor of the VectorColor helpers (compiled out by the "&& 0" in the
// enclosing #elif).
typedef __m64 VectorColor;

// Interleaves the low four bytes of the color with zero bytes, giving four
// 16-bit lanes.
// NOTE(review): passes a Color32 where the intrinsic takes an int; this relies
// on a conversion that is not visible in this file.
inline static __m64 loadColor(Color32 c)
{
	return _mm_unpacklo_pi8(_mm_cvtsi32_si64(c), _mm_setzero_si64());
}

// Per-lane absolute difference built from two unsigned saturating subtractions:
// one of them is zero in every lane, so OR-ing them yields |a-b|.
inline static __m64 absoluteDifference(__m64 a, __m64 b)
{
	// = |a-b| or |b-a|
	return _mm_or_si64(_mm_subs_pu16(a, b), _mm_subs_pu16(b, a));
}
	
// Sum of the four per-lane absolute differences (all four lanes are added,
// unlike the scalar fallback which uses squared r, g, b differences only).
inline uint colorDistance(__m64 a, __m64 b)
{
	// The union is used to read the individual 16-bit lanes of the result.
	union {
		__m64 v;
		uint16 part[4];
	} s;
	
	s.v = absoluteDifference(a, b);
		
	// @@ This is very slow!
	return s.part[0] + s.part[1] + s.part[2] + s.part[3];
}

// vectorEnd() expands to _mm_empty(), which must be issued after MMX code
// before x87 floating point is used again.
#define vectorEnd	_mm_empty

#elif defined(__VEC__)

// AltiVec flavor of the VectorColor helpers. Note that __VEC__ is explicitly
// #undef'd near the top of this file, so this branch is currently never taken.
typedef vector signed int VectorColor;

// Places the four channels of the color in the four 32-bit lanes of a vector.
inline static vector signed int loadColor(Color32 c)
{
	return (vector signed int) (c.r, c.g, c.b, c.a);
}

// Get the absolute distance between the given colors.
// Sums |c0 - c1| over all four lanes (vec_sums leaves the total in lane 3),
// then stores that lane to a scalar.
inline static uint colorDistance(vector signed int c0, vector signed int c1)
{
	int result;
	vector signed int v = vec_sums(vec_abs(vec_sub(c0, c1)), (vector signed int)0);
	vec_ste(vec_splat(v, 3), 0, &result);
	return result;
}

// Nothing to clean up after AltiVec code; present so callers can call
// vectorEnd() unconditionally.
inline void vectorEnd()
{
}

#else

// Scalar fallback: a "vector" color is simply the Color32 itself.
typedef Color32 VectorColor;

// Identity conversion; exists so the same calling code works with the SIMD
// variants of VectorColor.
inline static Color32 loadColor(Color32 c)
{
	return c;
}

// Returns the color with r, g and b multiplied by its alpha. The product is
// shifted right by 8 bits (i.e. divided by 256); alpha itself is unchanged.
inline static Color32 premultiplyAlpha(Color32 c)
{
	const int alpha = c.a;

	// 'c' is a by-value copy, so it can be scaled in place and returned.
	c.r = (c.r * alpha) >> 8;
	c.g = (c.g * alpha) >> 8;
	c.b = (c.b * alpha) >> 8;
	return c;
}

// Square of an unsigned value (wraps modulo the width of uint on overflow).
inline static uint sqr(uint s)
{
	const uint squared = s * s;
	return squared;
}

// Get the squared distance between the given colors. Only the r, g and b
// channels contribute; alpha is ignored.
inline static uint colorDistance(Color32 c0, Color32 c1)
{
	// A negative difference wraps when converted to uint, but its square is
	// still the correct value.
	const uint red = sqr(c0.r - c1.r);
	const uint green = sqr(c0.g - c1.g);
	const uint blue = sqr(c0.b - c1.b);
	return red + green + blue;
	// Alternative (sum of absolute differences):
	//return abs(c0.r - c1.r) + abs(c0.g - c1.g) + abs(c0.b - c1.b);
}

// No-op in the scalar fallback; callers invoke it after every batch of
// VectorColor work so that SIMD variants can restore state if they need to.
inline void vectorEnd()
{
}

#endif


// Returns the total error of the block against a 4-entry palette: for each of
// the 16 texels, the distance to its closest palette entry is accumulated.
inline static uint paletteError(const ColorBlock & rgba, Color32 palette[4])
{
	// Convert the palette once, outside the per-texel loop.
	VectorColor entries[4];
	for (uint p = 0; p < 4; p++) {
		entries[p] = loadColor(palette[p]);
	}

	uint total = 0;
	for (uint texel = 0; texel < 16; texel++) {
		const VectorColor pixel = loadColor(rgba.color(texel));

		uint nearest = colorDistance(pixel, entries[0]);
		for (uint p = 1; p < 4; p++) {
			nearest = min(nearest, colorDistance(pixel, entries[p]));
		}

		total += nearest;
	}

	vectorEnd();
	return total;
}

	
	
// Builds the packed 2-bit-per-texel index word for the block. Texel i occupies
// bits [2*i, 2*i+1]. The selector is derived from pairwise comparisons of the
// distances to the four palette entries, without branching.
inline static uint computeIndices(const ColorBlock & rgba, const Color32 palette[4])
{
	VectorColor entries[4];
	for (int p = 0; p < 4; p++) {
		entries[p] = loadColor(palette[p]);
	}

	uint packed = 0;
	for (int texel = 0; texel < 16; texel++) {
		const VectorColor pixel = loadColor(rgba.color(texel));

		const uint d0 = colorDistance(entries[0], pixel);
		const uint d1 = colorDistance(entries[1], pixel);
		const uint d2 = colorDistance(entries[2], pixel);
		const uint d3 = colorDistance(entries[3], pixel);

		// Strict comparisons; each is 0 or 1.
		const uint gt03 = d0 > d3;
		const uint gt12 = d1 > d2;
		const uint gt02 = d0 > d2;
		const uint gt13 = d1 > d3;
		const uint gt23 = d2 > d3;

		const uint lowBit = gt03 & gt23;
		const uint highBit = (gt12 & gt02) | (gt03 & gt13);

		packed |= (lowBit | (highBit << 1)) << (2 * texel);
	}

	vectorEnd();
	return packed;
}

// Same as computeIndices, except that every texel is alpha-premultiplied
// before it is compared against the palette.
inline static uint computeIndicesAlpha(const ColorBlock & rgba, const Color32 palette[4])
{
	VectorColor entries[4];
	for (int p = 0; p < 4; p++) {
		entries[p] = loadColor(palette[p]);
	}

	uint packed = 0;
	for (int texel = 0; texel < 16; texel++) {
		const VectorColor pixel = premultiplyAlpha(loadColor(rgba.color(texel)));

		const uint d0 = colorDistance(entries[0], pixel);
		const uint d1 = colorDistance(entries[1], pixel);
		const uint d2 = colorDistance(entries[2], pixel);
		const uint d3 = colorDistance(entries[3], pixel);

		// Strict comparisons; each is 0 or 1.
		const uint gt03 = d0 > d3;
		const uint gt12 = d1 > d2;
		const uint gt02 = d0 > d2;
		const uint gt13 = d1 > d3;
		const uint gt23 = d2 > d3;

		const uint lowBit = gt03 & gt23;
		const uint highBit = (gt12 & gt02) | (gt03 & gt13);

		// Texel i occupies bits [2*i, 2*i+1] of the result.
		packed |= (lowBit | (highBit << 1)) << (2 * texel);
	}

	vectorEnd();
	return packed;
}


// Builds a Color16 from integer channel values, saturating red and blue to
// [0, 31] and green to [0, 63].
//
// The arguments to clamp() are given value-first, (x, low, high), which is the
// order every other clamp() call in this file uses. The previous order
// (0, 31, r) treated 0 as the value being clamped, so negative inputs were not
// raised to 0.
inline static Color16 saturate16(int r, int g, int b)
{
	Color16 c;
	c.r = clamp(r, 0, 31);
	c.g = clamp(g, 0, 63);
	c.b = clamp(b, 0, 31);
	return c;
}


// Compressor that uses the luminance axis.
void nv::compressBlock_LuminanceAxis(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 c0, c1;
	rgba.luminanceRange(&c0, &c1);
	
	block->col0 = toColor16(c0);
	block->col1 = toColor16(c1);
	
	// Use 4 color mode only.
	if (block->col0.u < block->col1.u) {
		swap(block->col0.u, block->col1.u);
	}
	
	Color32 palette[4];
	block->evaluatePalette4(palette);
	
	block->indices = computeIndices(rgba, palette);
}


// Compressor that uses diameter axis.
void nv::compressBlock_DiameterAxis(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 c0, c1;
	rgba.diameterRange(&c0, &c1);
	
	block->col0 = toColor16(c0);
	block->col1 = toColor16(c1);
	
	// Use 4 color mode only.
	if (block->col0.u < block->col1.u) {
		swap(block->col0.u, block->col1.u);
	}
	
	Color32 palette[4];
	block->evaluatePalette4(palette);
	
	block->indices = computeIndices(rgba, palette);
}


// Compressor that uses bounding box.
void nv::compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 c0, c1;
	rgba.boundsRange(&c1, &c0);
	
	block->col0 = toColor16(c0);
	block->col1 = toColor16(c1);
	
	nvDebugCheck(block->col0.u > block->col1.u);
	
	// Use 4 color mode only.
	//if (block->col0.u < block->col1.u) {
	//	swap(block->col0.u, block->col1.u);
	//}
	
	Color32 palette[4];
	block->evaluatePalette4(palette);
	
	block->indices = computeIndices(rgba, palette);
}

// Compressor that uses bounding box and takes alpha into account.
void nv::compressBlock_BoundsRangeAlpha(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 c0, c1;
	rgba.boundsRange(&c1, &c0);
	
	if (rgba.hasAlpha())
	{
		block->col0 = toColor16(c1);
		block->col1 = toColor16(c0);
	}
	else
	{
		block->col0 = toColor16(c0);
		block->col1 = toColor16(c1);
	}
	
	Color32 palette[4];
	block->evaluatePalette(palette);
	
	block->indices = computeIndicesAlpha(rgba, palette);
}


// Compressor that uses the best fit axis.
void nv::compressBlock_BestFitAxis(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 c0, c1;
	rgba.bestFitRange(&c0, &c1);
	
	block->col0 = toColor16(c0);
	block->col1 = toColor16(c1);
	
	// Use 4 color mode only.
	if (block->col0.u < block->col1.u) {
		swap(block->col0.u, block->col1.u);
	}
	else if (block->col0.u == block->col1.u) {
		block->col0.u++;
	}
	
	Color32 palette[4];
	block->evaluatePalette4(palette);
	
	block->indices = computeIndices(rgba, palette);
}


// Compressor that tests all input color pairs.
void nv::compressBlock_TestAllPairs(const ColorBlock & rgba, BlockDXT1 * block)
{
	uint best_error = uint(-1);
	Color16 best_col0, best_col1;
	
	Color32 palette[4];
	
	// Test all color pairs.
	for(uint i = 0; i < 16; i++) {
		block->col0 = toColor16(rgba.color(i));
		
		for(uint ii = 0; ii < 16; ii++) {
			if( i != ii ) {
				block->col1 = toColor16(rgba.color(ii));
				block->evaluatePalette(palette);
				
				const uint error = paletteError(rgba, palette);
				if(error < best_error) {
					best_error = error;
					best_col0 = block->col0;
					best_col1 = block->col1;
				}
			}
		}
	}
	
	block->col0 = best_col0;
	block->col1 = best_col1;
	block->evaluatePalette(palette);
	
	block->indices = computeIndices(rgba, palette);
}


// Compressor that tests all pairs in the best fit axis.
void nv::compressBlock_AnalyzeBestFitAxis(const ColorBlock & rgba, BlockDXT1 * block)
{
	uint best_error = uint(-1);
	Color16 best_col0, best_col1;
	
	Color32 palette[4];
	

	// Find bounds of the search space.
	int r_min = 32;
	int r_max = 0;
	int g_min = 64;
	int g_max = 0;
	int b_min = 32;
	int b_max = 0;
	
	for(uint i = 0; i < 16; i++) {
		Color16 color = toColor16(rgba.color(i));
		
		r_min = min(r_min, int(color.r));
		r_max = max(r_max, int(color.r));
		g_min = min(g_min, int(color.g));
		g_max = max(g_max, int(color.g));
		b_min = min(b_min, int(color.b));
		b_max = max(b_max, int(color.b));
	}
	
	const int r_pad = 4 * max(1, (r_max - r_min));
	const int g_pad = 4 * max(1, (g_max - g_min));
	const int b_pad = 4 * max(1, (b_max - b_min));

	r_min = max(0, r_min - r_pad);
	r_max = min(31, r_max + r_pad);
	g_min = max(0, g_min - g_pad);
	g_max = min(63, g_max + g_pad);
	b_min = max(0, b_min - b_pad);
	b_max = min(31, b_max + b_pad);
	
	const Line3 line = rgba.bestFitLine();
	
	if( fabs(line.direction().x()) > fabs(line.direction().y()) && fabs(line.direction().x()) > fabs(line.direction().z()) ) {
		for(int r0 = r_min; r0 <= r_max; r0++) {
			const float x0 = float((r0 << 3) | (r0 >> 2));
			const float t0 = (x0 - line.origin().x()) / line.direction().x();
			const float y0 = line.origin().y() + t0 * line.direction().y();
			const float z0 = line.origin().z() + t0 * line.direction().z();
			
			const int g0 = clamp(int(y0), 0, 255) >> 2;
			const int b0 = clamp(int(z0), 0, 255) >> 3;
			
			for(int r1 = r_min; r1 <= r_max; r1++) {
				const float x1 = float((r1 << 3) | (r1 >> 2));
				const float t1 = (x1 - line.origin().x()) / line.direction().x();
				const float y1 = line.origin().y() + t1 * line.direction().y();
				const float z1 = line.origin().z() + t1 * line.direction().z();
				
				const int g1 = clamp(int(y1), 0, 255) >> 2;
				const int b1 = clamp(int(z1), 0, 255) >> 3;
				
				// Test one pixel around.
				for (int i0 = -1; i0 <= 1; i0++) {
					for (int j0 = -1; j0 <= 1; j0++) {
						for (int i1 = -1; i1 <= 1; i1++) {
							for (int j1 = -1; j1 <= 1; j1++) { 
								if( g0+i0 >= 0 && g0+i0 < 64 && g1+i1 >= 0 && g1+i1 < 64 &&
									b0+j0 >= 0 && b0+j0 < 32 && b1+j1 >= 0 && b1+j1 < 32 )
								{
									block->col0.r = r0;
									block->col0.g = g0 + i0;
									block->col0.b = b0 + j0;
									block->col1.r = r1;
									block->col1.g = g1 + i1;
									block->col1.b = b1 + j1;
									block->evaluatePalette(palette);
									
									const uint error = paletteError(rgba, palette);
									if(error < best_error) {
										best_error = error;
										best_col0 = block->col0;
										best_col1 = block->col1;
									}
								}
							}
						}
					}
				}
			}
		}
	}
	else if( fabs(line.direction().y()) > fabs(line.direction().z()) ) {
		for(int g0 = g_min; g0 <= g_max; g0++) {
			const float y0 = float((g0 << 2) | (g0 >> 4));
			const float t0 = (y0 - line.origin().y()) / line.direction().y();
			const float x0 = line.origin().x() + t0 * line.direction().x();
			const float z0 = line.origin().z() + t0 * line.direction().z();
			
			const int r0 = clamp(int(x0), 0, 255) >> 3;
			const int b0 = clamp(int(z0), 0, 255) >> 3;
			
			for(int g1 = g_min; g1 <= g_max; g1++) {
				const float y1 = float((g1 << 2) | (g1 >> 4));
				const float t1 = (y1 - line.origin().y()) / line.direction().y();
				const float x1 = line.origin().x() + t1 * line.direction().x();
				const float z1 = line.origin().z() + t1 * line.direction().z();
				
				const int r1 = clamp(int(x1), 0, 255) >> 2;
				const int b1 = clamp(int(z1), 0, 255) >> 3;
				
				// Test one pixel around.
				for (int i0 = -1; i0 <= 1; i0++) {
					for (int j0 = -1; j0 <= 1; j0++) {
						for (int i1 = -1; i1 <= 1; i1++) {
							for (int j1 = -1; j1 <= 1; j1++) { 
								if( r0+i0 >= 0 && r0+i0 < 32 && r1+i1 >= 0 && r1+i1 < 32 &&
									b0+j0 >= 0 && b0+j0 < 32 && b1+j1 >= 0 && b1+j1 < 32 )
								{
									block->col0.r = r0 + i0;
									block->col0.g = g0;
									block->col0.b = b0 + j0;
									block->col1.r = r1 + i1;
									block->col1.g = g1;
									block->col1.b = b1 + j1;
									block->evaluatePalette(palette);
									
									const uint error = paletteError(rgba, palette);
									if(error < best_error) {
										best_error = error;
										best_col0 = block->col0;
										best_col1 = block->col1;
									}
								}
							}
						}
					}
				}
			}
		}
	}
	else {
		for(int b0 = b_min; b0 <= b_max; b0++) {
			const float z0 = float((b0 << 3) | (b0 >> 2));
			const float t0 = (z0 - line.origin().z()) / line.direction().z();
			const float y0 = line.origin().y() + t0 * line.direction().y();
			const float x0 = line.origin().x() + t0 * line.direction().x();
			
			const int g0 = clamp(int(y0), 0, 255) >> 2;
			const int r0 = clamp(int(x0), 0, 255) >> 3;
			
			for(int b1 = b_min; b1 <= b_max; b1++) {
				const float z1 = float((b1 << 3) | (b1 >> 2));
				const float t1 = (z1 - line.origin().z()) / line.direction().z();
				const float y1 = line.origin().y() + t1 * line.direction().y();
				const float x1 = line.origin().x() + t1 * line.direction().x();
				
				const int g1 = clamp(int(y1), 0, 255) >> 2;
				const int r1 = clamp(int(x1), 0, 255) >> 3;
				
				// Test one pixel around.
				for (int i0 = -1; i0 <= 1; i0++) {
					for (int j0 = -1; j0 <= 1; j0++) {
						for (int i1 = -1; i1 <= 1; i1++) {
							for (int j1 = -1; j1 <= 1; j1++) { 
								if( g0+i0 >= 0 && g0+i0 < 64 && g1+i1 >= 0 && g1+i1 < 64 &&
									r0+j0 >= 0 && r0+j0 < 32 && r1+j1 >= 0 && r1+j1 < 32 )
								{
									block->col0.r = r0 + j0;
									block->col0.g = g0 + i0;
									block->col0.b = b0;
									block->col1.r = r1 + j1;
									block->col1.g = g1 + i1;
									block->col1.b = b1;
									block->evaluatePalette(palette);
									
									const uint error = paletteError(rgba, palette);
									if(error < best_error) {
										best_error = error;
										best_col0 = block->col0;
										best_col1 = block->col1;
									}
								}
							}
						}
					}
				}
			}
		}
	}

	block->col0 = best_col0;
	block->col1 = best_col1;
	block->evaluatePalette(palette);
	
	block->indices = computeIndices(rgba, palette);
}


// Improve palette iteratively using alternate 3d search as suggested by Dave Moore.
//
// Starting from the endpoints already stored in 'block', each pass first tries
// every col0 within +/-W steps per channel while col1 is held at its best known
// value, then every col1 while col0 is held at its best known value. Any
// strictly better pair is kept. The loop ends when a full pass brings no
// improvement; the best endpoints and their indices are then written to 'block'.
void nv::refineSolution_3dSearch(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 palette[4];
	block->evaluatePalette(palette);

	uint best_error = paletteError(rgba, palette);
	Color16 best_col0 = block->col0;
	Color16 best_col1 = block->col1;

	const int W = 2;

	while(true) {
		bool changed = false;

		// Vary col0. The partner must be the best col1, not the last candidate
		// that the previous col1 scan happened to leave in the block.
		block->col1 = best_col1;

		const int r0 = best_col0.r;
		const int g0 = best_col0.g;
		const int b0 = best_col0.b;

		for(int z = -W; z <= W; z++) {
			for(int y = -W; y <= W; y++) {
				for(int x = -W; x <= W; x++) {
					block->col0 = saturate16(r0 + x, g0 + y, b0 + z);
					block->evaluatePalette(palette);

					const uint error = paletteError(rgba, palette);
					if(error < best_error) {
						best_error = error;
						best_col0 = block->col0;
						best_col1 = block->col1;
						changed = true;
					}
				}
			}
		}

		// Vary col1. Restore col0 to the best value found above; the scan left
		// its last candidate in the block.
		block->col0 = best_col0;

		const int r1 = best_col1.r;
		const int g1 = best_col1.g;
		const int b1 = best_col1.b;

		for(int z = -W; z <= W; z++) {
			for(int y = -W; y <= W; y++) {
				for(int x = -W; x <= W; x++) {
					block->col1 = saturate16(r1 + x, g1 + y, b1 + z);
					block->evaluatePalette(palette);

					const uint error = paletteError(rgba, palette);
					if(error < best_error) {
						best_error = error;
						best_col0 = block->col0;
						best_col1 = block->col1;
						changed = true;
					}
				}
			}
		}

		if( !changed ) {
			// Stop at local minima.
			break;
		}
	}

	block->col0 = best_col0;
	block->col1 = best_col1;
	block->evaluatePalette(palette);

	block->indices = computeIndices(rgba, palette);
}


// Improve the palette iteratively using 6d search as suggested by Charles Bloom.
void nv::refineSolution_6dSearch(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 palette[4];
	block->evaluatePalette(palette);
	
	uint best_error = paletteError(rgba, palette);
	Color16 best_col0 = block->col0;
	Color16 best_col1 = block->col1;
	
	const int W = 1;

	while(true) {
		bool changed = false;
		const int r0 = best_col0.r;
		const int g0 = best_col0.g;
		const int b0 = best_col0.b;
		const int r1 = best_col1.r;
		const int g1 = best_col1.g;
		const int b1 = best_col1.b;
		
		for(int z0 = -W; z0 <= W; z0++) {
			for(int y0 = -W; y0 <= W; y0++) {
				for(int x0 = -W; x0 <= W; x0++) {
					for(int z1 = -W; z1 <= W; z1++) {
						for(int y1 = -W; y1 <= W; y1++) {
							for(int x1 = -W; x1 <= W; x1++) {
								
								block->col0 = saturate16(r0 + x0, g0 + y0, b0 + z0);
								block->col1 = saturate16(r1 + x1, g1 + y1, b1 + z1);
								block->evaluatePalette(palette);
								
								const uint error = paletteError(rgba, palette);
								if(error < best_error) {
									best_error = error;
									best_col0 = block->col0;
									best_col1 = block->col1;
									changed = true;
								}
							}
						}
					}
				}
			}
		}
		
		if( !changed ) {
			// Stop at local minima.
			break;
		}
	}
	
	block->col0 = best_col0;
	block->col1 = best_col1;
	block->evaluatePalette(palette);
	
	block->indices = computeIndices(rgba, palette);
}



// Improve the palette iteratively using alternate 1d search as suggested by Walt Donovan.
void nv::refineSolution_1dSearch(const ColorBlock & rgba, BlockDXT1 * block)
{
	Color32 palette[4];
	block->evaluatePalette(palette);
	
	uint best_error = paletteError(rgba, palette);
	Color16 best_col0 = block->col0;
	Color16 best_col1 = block->col1;
	
	const int W = 4;
	
	while(true) {
		bool changed = false;
		
		const int r0 = best_col0.r;
		const int g0 = best_col0.g;
		const int b0 = best_col0.b;
		
		for(int z = -W; z <= W; z++) {
			block->col0.b = clamp(b0 + z, 0, 31);
			block->evaluatePalette(palette);
					
			const uint error = paletteError(rgba, palette);
			if(error < best_error) {
				best_error = error;
				best_col0 = block->col0;
				best_col1 = block->col1;
				changed = true;
			}
		}
		
		for(int y = -W; y <= W; y++) {
			block->col0.g = clamp(g0 + y, 0, 63);
			block->evaluatePalette(palette);
				
			const uint error = paletteError(rgba, palette);
			if(error < best_error) {
				best_error = error;
				best_col0 = block->col0;
				best_col1 = block->col1;
				changed = true;
			}
		}
		
		for(int x = -W; x <= W; x++) {
			block->col0.r = clamp(r0 + x, 0, 31);
			block->evaluatePalette(palette);
			
			const uint error = paletteError(rgba, palette);
			if(error < best_error) {
				best_error = error;
				best_col0 = block->col0;
				best_col1 = block->col1;
				changed = true;
			}
		}
		
		
		const int r1 = best_col1.r;
		const int g1 = best_col1.g;
		const int b1 = best_col1.b;
		
		for(int z = -W; z <= W; z++) {
			block->col1.b = clamp(b1 + z, 0, 31);
			block->evaluatePalette(palette);
			
			const uint error = paletteError(rgba, palette);
			if(error < best_error) {
				best_error = error;
				best_col0 = block->col0;
				best_col1 = block->col1;
				changed = true;
			}
		}
		
		for(int y = -W; y <= W; y++) {
			block->col1.g = clamp(g1 + y, 0, 63);
			block->evaluatePalette(palette);
			
			const uint error = paletteError(rgba, palette);
			if(error < best_error) {
				best_error = error;
				best_col0 = block->col0;
				best_col1 = block->col1;
				changed = true;
			}
		}
		
		for(int x = -W; x <= W; x++) {
			block->col1.r = clamp(r1 + x, 0, 31);
			block->evaluatePalette(palette);
			
			const uint error = paletteError(rgba, palette);
			if(error < best_error) {
				best_error = error;
				best_col0 = block->col0;
				best_col1 = block->col1;
				changed = true;
			}
		}
		
		if( !changed ) {
			// Stop at local minima.
			break;
		}
	}
	
	block->col0 = best_col0;
	block->col1 = best_col1;
	block->evaluatePalette(palette);
	
	block->indices = computeIndices(rgba, palette);
}

// Returns the total squared green-channel error of the block against the
// 4-color palette of 'block': for each texel, the squared difference to the
// closest palette green is accumulated. The block itself is not modified.
static uint computeGreenError(const ColorBlock & rgba, const BlockDXT1 * block)
{
	Color32 colors[4];
	block->evaluatePalette4(colors);

	uint totalError = 0;

	for (uint i = 0; i < 16; i++)
	{
		uint8 green = rgba.color(i).g;

		// Larger than any achievable squared difference of two 8-bit values.
		uint besterror = 256*256;
		for(uint p = 0; p < 4; p++)
		{
			int d = colors[p].g - green;
			uint error = d * d;

			if (error < besterror)
			{
				besterror = error;
			}
		}

		totalError += besterror;
	}

	return totalError;
}

// Brute force compressor for DXT5n
void nv::compressGreenBlock_BruteForce(const ColorBlock & rgba, BlockDXT1 * block)
{
	nvDebugCheck(block != NULL);
	
	uint8 ming = 63;
	uint8 maxg = 0;
	
	// Get min/max green.
	for (uint i = 0; i < 16; i++)
	{
		uint8 green = rgba.color(i).g >> 2;
		ming = min(ming, green);
		maxg = max(maxg, green);
	}

	block->col0.u = 0;
	block->col1.u = 0;
	block->col0.g = maxg;
	block->col1.g = ming;

	if (maxg - ming > 4)
	{
		int besterror = computeGreenError(rgba, block);
		int bestg0 = maxg;
		int bestg1 = ming;
		
		for (int g0 = ming+5; g0 < maxg; g0++)
		{
			for (int g1 = ming; g1 < g0-4; g1++)
			{
				if ((maxg-g0) + (g1-ming) > besterror)
					continue;
				
				block->col0.g = g0;
				block->col1.g = g1;
				int error = computeGreenError(rgba, block);
				
				if (error < besterror)
				{
					besterror = error;
					bestg0 = g0;
					bestg1 = g1;
				}
			}
		}
		
		block->col0.g = bestg0;
		block->col1.g = bestg1;
	}
	
	Color32 palette[4];
	block->evaluatePalette(palette);
	block->indices = computeIndices(rgba, palette);
}



// Returns the error of an encoded DXT1 block against the source texels: the sum
// over all 16 texels of the distance between the texel and the palette entry
// that the block's stored index selects for it.
uint nv::blockError(const ColorBlock & rgba, const BlockDXT1 & block)
{
	Color32 colors[4];
	block.evaluatePalette(colors);

	VectorColor entries[4];
	for (uint p = 0; p < 4; p++) {
		entries[p] = loadColor(colors[p]);
	}

	uint total = 0;
	for (uint texel = 0; texel < 16; texel++) {
		const VectorColor pixel = loadColor(rgba.color(texel));

		// Two index bits per texel, texel 0 in the lowest bits.
		const int selector = (block.indices >> (2 * texel)) & 3;

		total += colorDistance(pixel, entries[selector]);
	}

	//nvDebugCheck(total == paletteError(rgba, colors));

	vectorEnd();
	return total;
}


// Returns the error of an encoded DXT5 alpha block against the source texels:
// the sum of squared differences between each texel's alpha and the palette
// value that the block's stored index selects for it.
uint nv::blockError(const ColorBlock & rgba, const AlphaBlockDXT5 & block)
{
	uint8 alphas[8];
	block.evaluatePalette(alphas);

	uint8 selectors[16];
	block.indices(selectors);

	uint total = 0;
	for (uint texel = 0; texel < 16; texel++) {
		const int delta = alphas[selectors[texel]] - rgba.color(texel).a;
		total += uint(delta * delta);
	}

	return total;
}



void nv::optimizeEndPoints(const ColorBlock & rgba, BlockDXT1 * block)
{
	float alpha2_sum = 0.0f;
	float beta2_sum = 0.0f;
	float alphabeta_sum = 0.0f;
	Vector3 alphax_sum(zero);
	Vector3 betax_sum(zero);
	
	for( int i = 0; i < 16; ++i )
	{
		const uint bits = block->indices >> (2 * i);

		float beta = float(bits & 1);
		if (bits & 2) beta = (1 + beta) / 3.0f;
		float alpha = 1.0f - beta;

		const Vector3 x = toVector4(rgba.color(i)).xyz();
		
		alpha2_sum += alpha * alpha;
		beta2_sum += beta * beta;
		alphabeta_sum += alpha * beta;
		alphax_sum += alpha * x;
		betax_sum += beta * x;
	}

	float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
		
	Vector3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
	Vector3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

	Vector3 zero(0, 0, 0);
	Vector3 one(1, 1, 1);
	a = min(one, max(zero, a));
	b = min(one, max(zero, b));
	
	BlockDXT1 B;

	// Round a,b to 565.
	B.col0.r = uint16(a.x() * 31);
	B.col0.g = uint16(a.y() * 63);
	B.col0.b = uint16(a.z() * 31);
	B.col1.r = uint16(b.x() * 31);
	B.col1.g = uint16(b.y() * 63);
	B.col1.b = uint16(b.z() * 31);
	B.indices = block->indices;

	// Force 4 color mode.
	if (B.col0.u < B.col1.u)
	{
		swap(B.col0.u, B.col1.u);
		B.indices ^= 0x55555555;
	}
	else if (B.col0.u == B.col1.u)
	{
		block->indices = 0;
	}

	if (blockError(rgba, B) < blockError(rgba, *block))
	{
		*block = B;
	}
}

// Encode DXT3 block.
// The color part is encoded with the bounds-range DXT1 compressor and the alpha
// part with the DXT3 alpha encoder; the two halves are independent.
void nv::compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT3 * block)
{
	compressBlock_BoundsRange(rgba, &block->color);
	compressBlock(rgba, &block->alpha);
}

// Encode DXT3 alpha block.
void nv::compressBlock(const ColorBlock & rgba, AlphaBlockDXT3 * block)
{
	block->alpha0 = rgba.color(0).a >> 4;
	block->alpha1 = rgba.color(1).a >> 4;
	block->alpha2 = rgba.color(2).a >> 4;
	block->alpha3 = rgba.color(3).a >> 4;
	block->alpha4 = rgba.color(4).a >> 4;
	block->alpha5 = rgba.color(5).a >> 4;
	block->alpha6 = rgba.color(6).a >> 4;
	block->alpha7 = rgba.color(7).a >> 4;
	block->alpha8 = rgba.color(8).a >> 4;
	block->alpha9 = rgba.color(9).a >> 4;
	block->alphaA = rgba.color(10).a >> 4;
	block->alphaB = rgba.color(11).a >> 4;
	block->alphaC = rgba.color(12).a >> 4;
	block->alphaD = rgba.color(13).a >> 4;
	block->alphaE = rgba.color(14).a >> 4;
	block->alphaF = rgba.color(15).a >> 4;
}



// Assigns to every texel the index of the closest entry of the block's 8-entry
// alpha palette (first entry wins on ties) and returns the total squared error.
// The block's endpoints are read; only its indices are written.
static uint computeAlphaIndices(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	uint8 palette[8];
	block->evaluatePalette(palette);

	uint sum = 0;

	for (uint texel = 0; texel < 16; texel++)
	{
		const uint8 target = rgba.color(texel).a;

		uint closest = 8;				// invalid until a candidate is accepted
		uint closestError = 256*256;	// above any possible squared difference
		for (uint entry = 0; entry < 8; entry++)
		{
			const int delta = palette[entry] - target;
			const uint entryError = delta * delta;

			if (entryError < closestError)
			{
				closestError = entryError;
				closest = entry;
			}
		}
		nvDebugCheck(closest < 8);

		sum += closestError;
		block->setIndex(texel, closest);
	}

	return sum;
}

// Returns the total squared alpha error of the block against the 8-entry alpha
// palette of 'block': for each texel, the squared difference to the closest
// palette value is accumulated. Unlike computeAlphaIndices, the block is not
// modified.
static uint computeAlphaError(const ColorBlock & rgba, const AlphaBlockDXT5 * block)
{
	uint8 alphas[8];
	block->evaluatePalette(alphas);

	uint totalError = 0;

	for (uint i = 0; i < 16; i++)
	{
		uint8 alpha = rgba.color(i).a;

		// Larger than any achievable squared difference of two 8-bit values.
		uint besterror = 256*256;
		for(uint p = 0; p < 8; p++)
		{
			int d = alphas[p] - alpha;
			uint error = d * d;

			if (error < besterror)
			{
				besterror = error;
			}
		}

		totalError += besterror;
	}

	return totalError;
}


void nv::compressBlock_BoundsRange(const ColorBlock & rgba, BlockDXT5 * block)
{
	Color32 c0, c1;
	rgba.boundsRangeAlpha(&c1, &c0);
	
	block->color.col0 = toColor16(c0);
	block->color.col1 = toColor16(c1);
	
	nvDebugCheck(block->color.col0.u > block->color.col1.u);
	
	Color32 palette[4];
	block->color.evaluatePalette4(palette);
	
	block->color.indices = computeIndices(rgba, palette);
	
	nvDebugCheck(c0.a <= c1.a);
	
	block->alpha.alpha0 = c0.a;
	block->alpha.alpha1 = c1.a;
	
	computeAlphaIndices(rgba, &block->alpha);
}


// Encode a DXT5 alpha block from the alpha range of the texels.
//
// The maximum and minimum alpha are each moved inwards by 1/32 of the range.
// The block is then encoded twice, once with (max, min) and once with
// (min, max) as (alpha0, alpha1), and the encoding with the lower error is
// stored in 'block'. On a tie the second encoding is kept. Returns that error.
uint nv::compressBlock_BoundsRange(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	uint8 alpha0 = 0;
	uint8 alpha1 = 255;

	// Get min/max alpha.
	for (uint i = 0; i < 16; i++)
	{
		uint8 alpha = rgba.color(i).a;
		alpha0 = max(alpha0, alpha);
		alpha1 = min(alpha1, alpha);
	}

	// Compute the inset once from the original range so that both ends move by
	// the same amount. (Previously the second inset was derived from the
	// already adjusted maximum and could come out smaller.)
	const uint8 inset = (alpha0 - alpha1) / 32;
	alpha0 -= inset;
	alpha1 += inset;

	AlphaBlockDXT5 block0;
	block0.alpha0 = alpha0;
	block0.alpha1 = alpha1;
	uint error0 = computeAlphaIndices(rgba, &block0);

	AlphaBlockDXT5 block1;
	block1.alpha0 = alpha1;
	block1.alpha1 = alpha0;
	uint error1 = computeAlphaIndices(rgba, &block1);

	if (error0 < error1)
	{
		*block = block0;
		return error0;
	}
	else
	{
		*block = block1;
		return error1;
	}
}

// Brute force encoder for a DXT5 alpha block: searches endpoint pairs inside
// the alpha range of the texels, stores the best pair and its indices in
// 'block', and returns the resulting error.
uint nv::compressBlock_BruteForce(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	// Get min/max alpha.
	uint8 lowest = 255;
	uint8 highest = 0;
	for (uint texel = 0; texel < 16; texel++)
	{
		uint8 value = rgba.color(texel).a;
		lowest = min(lowest, value);
		highest = max(highest, value);
	}

	// Default choice: the full range.
	block->alpha0 = highest;
	block->alpha1 = lowest;

	// The search is only attempted when the range is wider than 8 steps.
	if (highest - lowest > 8)
	{
		int lowestError = computeAlphaError(rgba, block);
		int chosenHi = highest;
		int chosenLo = lowest;

		for (int hi = lowest + 9; hi < highest; hi++)
		{
			for (int lo = lowest; lo < hi - 8; lo++)
			{
				// Pruning: skip pairs whose total distance from the range ends
				// already exceeds the best error so far. (An earlier, disabled
				// experiment also added the distance to the texel closest to
				// the range centre to this bound.)
				const int shrink = (highest - hi) + (lo - lowest);
				if (shrink > lowestError)
					continue;

				block->alpha0 = hi;
				block->alpha1 = lo;
				const int candidateError = computeAlphaError(rgba, block);

				if (candidateError < lowestError)
				{
					lowestError = candidateError;
					chosenHi = hi;
					chosenLo = lo;
				}
			}
		}

		block->alpha0 = chosenHi;
		block->alpha1 = chosenLo;
	}

	return computeAlphaIndices(rgba, block);
}

static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	float alpha2_sum = 0;
	float beta2_sum = 0;
	float alphabeta_sum = 0;
	float alphax_sum = 0;
	float betax_sum = 0;

	for (int i = 0; i < 16; i++)
	{
		uint idx = block->index(i);
		float alpha;
		if (idx < 2) alpha = 1.0f - idx;
		else alpha = (8.0f - idx) / 7.0f;
		
		float beta = 1 - alpha;

        alpha2_sum += alpha * alpha;
        beta2_sum += beta * beta;
        alphabeta_sum += alpha * beta;
        alphax_sum += alpha * rgba.color(i).a;
        betax_sum += beta * rgba.color(i).a;
	}

    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

    float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
	float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

	uint alpha0 = uint(min(max(a, 0.0f), 255.0f));
	uint alpha1 = uint(min(max(b, 0.0f), 255.0f));

	if (alpha0 < alpha1)
	{
		swap(alpha0, alpha1);

		// Flip indices:
		for (int i = 0; i < 16; i++)
		{
			uint idx = block->index(i);
			if (idx < 2) block->setIndex(i, 1 - idx);
			else block->setIndex(i, 9 - idx);
		}
	}
	else if (alpha0 == alpha1)
	{
		for (int i = 0; i < 16; i++)
		{
			block->setIndex(i, 0);
		}
	}

	block->alpha0 = alpha0;
	block->alpha1 = alpha1;
}


static void optimizeAlpha6(const ColorBlock & rgba, AlphaBlockDXT5 * block)
{
	float alpha2_sum = 0;
	float beta2_sum = 0;
	float alphabeta_sum = 0;
	float alphax_sum = 0;
	float betax_sum = 0;

	for (int i = 0; i < 16; i++)
	{
		uint8 x = rgba.color(i).a;
		if (x == 0 || x == 255) continue;

		uint bits = block->index(i);
		if (bits == 6 || bits == 7) continue;

		float alpha;
		if (bits == 0) alpha = 1.0f;
		else if (bits == 1) alpha = 0.0f;
		else alpha = (6.0f - block->index(i)) / 5.0f;
		
		float beta = 1 - alpha;

        alpha2_sum += alpha * alpha;
        beta2_sum += beta * beta;
        alphabeta_sum += alpha * beta;
        alphax_sum += alpha * x;
        betax_sum += beta * x;
	}

    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

    float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
	float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

	uint alpha0 = uint(min(max(a, 0.0f), 255.0f));
	uint alpha1 = uint(min(max(b, 0.0f), 255.0f));

	if (alpha0 > alpha1)
	{
		swap(alpha0, alpha1);
	}

	block->alpha0 = alpha0;
	block->alpha1 = alpha1;
}



// Returns true when the two alpha blocks agree in every bit above the low 16
// bits of their packed value, i.e. in the 48 bits that the mask selects.
// NOTE(review): this presumes the two 8-bit endpoints occupy the low 16 bits of
// 'u' and the indices the rest - confirm against AlphaBlockDXT5.
//
// The selected bits are isolated with '&'. OR-ing with the mask, as was done
// before, forces those 48 bits to 1 in both operands and therefore compared
// only the low 16 bits instead.
static bool sameIndices(const AlphaBlockDXT5 & block0, const AlphaBlockDXT5 & block1)
{
	const uint64 mask = ~uint64(0xFFFF);
	return (block0.u & mask) == (block1.u & mask);
}


// Iterative encoder for a DXT5 alpha block.
//
// Starts from the alpha range of the texels, inset by 1/34 of the range at both
// ends, then alternates between re-fitting the endpoints (optimizeAlpha8) and
// re-assigning the indices (computeAlphaIndices) for as long as the error keeps
// decreasing. The best block found is written to 'resultblock' and its error is
// returned.
uint nv::compressBlock_Iterative(const ColorBlock & rgba, AlphaBlockDXT5 * resultblock)
{
	uint8 alpha0 = 0;
	uint8 alpha1 = 255;

	// Get min/max alpha.
	for (uint i = 0; i < 16; i++)
	{
		uint8 alpha = rgba.color(i).a;
		alpha0 = max(alpha0, alpha);
		alpha1 = min(alpha1, alpha);
	}

	AlphaBlockDXT5 block;
	block.alpha0 = alpha0 - (alpha0 - alpha1) / 34;
	block.alpha1 = alpha1 + (alpha0 - alpha1) / 34;
	uint besterror = computeAlphaIndices(rgba, &block);

	AlphaBlockDXT5 bestblock = block;

	while(true)
	{
		optimizeAlpha8(rgba, &block);
		uint error = computeAlphaIndices(rgba, &block);

		if (error >= besterror)
		{
			// No improvement, stop.
			break;
		}

		// The new block is strictly better, so it is always recorded together
		// with its error. (Previously the early exit below stored the block but
		// returned the error of the block it replaced.)
		const bool converged = sameIndices(block, bestblock);

		besterror = error;
		bestblock = block;

		if (converged)
		{
			break;
		}
	}

	// Copy best block to result;
	*resultblock = bestblock;

	return besterror;
}