430 lines
18 KiB
C++
430 lines
18 KiB
C++
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Contains source code from the article "Radix Sort Revisited".
|
|
* \file Radix.cpp
|
|
* \author Pierre Terdiman
|
|
* \date April, 4, 2000
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Revisited Radix Sort.
|
|
* This is my new radix routine:
|
|
* - it uses indices and doesn't recopy the values anymore, hence wasting less ram
|
|
* - it creates all the histograms in one run instead of four
|
|
* - it sorts words faster than dwords and bytes faster than words
|
|
* - it correctly sorts negative floating-point values by patching the offsets
|
|
* - it automatically takes advantage of temporal coherence
|
|
* - multiple keys support is a side effect of temporal coherence
|
|
* - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
|
|
*
|
|
* History:
|
|
* - 08.15.98: very first version
|
|
* - 04.04.00: recoded for the radix article
|
|
* - 12.xx.00: code lifting
|
|
* - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
|
|
* - 10.11.01: added local ram support
|
|
* - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
|
|
*
|
|
* \class RadixSort
|
|
* \author Pierre Terdiman
|
|
* \version 1.3
|
|
* \date August, 15, 1998
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/*
|
|
To do:
|
|
- add an offset parameter between two input values (avoid some data recopy sometimes)
|
|
- unroll ? asm ?
|
|
*/
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
// Header
|
|
|
|
#include <nvcore/Radix.h>
|
|
|
|
#include <string.h> // memset
|
|
|
|
//using namespace IceCore;
|
|
|
|
#define DELETEARRAY(a) { delete [] a; a = NULL; }
|
|
#define CHECKALLOC(a)
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Constructor.
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0)
|
|
{
|
|
#ifndef RADIX_LOCAL_RAM
|
|
// Allocate input-independent ram
|
|
mHistogram = new uint32[256*4];
|
|
mOffset = new uint32[256];
|
|
#endif
|
|
// Initialize indices
|
|
resetIndices();
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Destructor.
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
RadixSort::~RadixSort()
|
|
{
|
|
// Release everything
|
|
#ifndef RADIX_LOCAL_RAM
|
|
DELETEARRAY(mOffset);
|
|
DELETEARRAY(mHistogram);
|
|
#endif
|
|
DELETEARRAY(mIndices2);
|
|
DELETEARRAY(mIndices);
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Resizes the inner lists.
|
|
* \param nb [in] new size (number of dwords)
|
|
* \return true if success
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
bool RadixSort::resize(uint32 nb)
|
|
{
|
|
// Free previously used ram
|
|
DELETEARRAY(mIndices2);
|
|
DELETEARRAY(mIndices);
|
|
|
|
// Get some fresh one
|
|
mIndices = new uint32[nb]; CHECKALLOC(mIndices);
|
|
mIndices2 = new uint32[nb]; CHECKALLOC(mIndices2);
|
|
mCurrentSize = nb;
|
|
|
|
// Initialize indices so that the input buffer is read in sequential order
|
|
resetIndices();
|
|
|
|
return true;
|
|
}
|
|
|
|
#define CHECK_RESIZE(n) \
|
|
if(n!=mPreviousSize) \
|
|
{ \
|
|
if(n>mCurrentSize) resize(n); \
|
|
else resetIndices(); \
|
|
mPreviousSize = n; \
|
|
}
|
|
|
|
#define CREATE_HISTOGRAMS(type, buffer) \
|
|
/* Clear counters */ \
|
|
memset(mHistogram, 0, 256*4*sizeof(uint32)); \
|
|
\
|
|
/* Prepare for temporal coherence */ \
|
|
type PrevVal = (type)buffer[mIndices[0]]; \
|
|
bool AlreadySorted = true; /* Optimism... */ \
|
|
uint32* Indices = mIndices; \
|
|
\
|
|
/* Prepare to count */ \
|
|
uint8* p = (uint8*)input; \
|
|
uint8* pe = &p[nb*4]; \
|
|
uint32* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
|
|
uint32* h1= &mHistogram[256]; /* Histogram for second pass */ \
|
|
uint32* h2= &mHistogram[512]; /* Histogram for third pass */ \
|
|
uint32* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
|
|
\
|
|
while(p!=pe) \
|
|
{ \
|
|
/* Read input buffer in previous sorted order */ \
|
|
type Val = (type)buffer[*Indices++]; \
|
|
/* Check whether already sorted or not */ \
|
|
if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
|
|
/* Update for next iteration */ \
|
|
PrevVal = Val; \
|
|
\
|
|
/* Create histograms */ \
|
|
h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
|
|
} \
|
|
\
|
|
/* If all input values are already sorted, we just have to return and leave the */ \
|
|
/* previous list unchanged. That way the routine may take advantage of temporal */ \
|
|
/* coherence, for example when used to sort transparent faces. */ \
|
|
if(AlreadySorted) { mNbHits++; return *this; } \
|
|
\
|
|
/* Else there has been an early out and we must finish computing the histograms */ \
|
|
while(p!=pe) \
|
|
{ \
|
|
/* Create histograms without the previous overhead */ \
|
|
h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
|
|
}
|
|
|
|
#define CHECK_PASS_VALIDITY(pass) \
|
|
/* Shortcut to current counters */ \
|
|
uint32* CurCount = &mHistogram[pass<<8]; \
|
|
\
|
|
/* Reset flag. The sorting pass is supposed to be performed. (default) */ \
|
|
bool PerformPass = true; \
|
|
\
|
|
/* Check pass validity */ \
|
|
\
|
|
/* If all values have the same byte, sorting is useless. */ \
|
|
/* It may happen when sorting bytes or words instead of dwords. */ \
|
|
/* This routine actually sorts words faster than dwords, and bytes */ \
|
|
/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
|
|
/* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
|
|
\
|
|
/* Get first byte */ \
|
|
uint8 UniqueVal = *(((uint8*)input)+pass); \
|
|
\
|
|
/* Check that byte's counter */ \
|
|
if(CurCount[UniqueVal]==nb) PerformPass=false;
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Main sort routine.
|
|
* This one is for integer values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
|
|
* \param input [in] a list of integer values to sort
|
|
* \param nb [in] number of values to sort
|
|
* \param signedvalues [in] true to handle negative values, false if you know your input buffer only contains positive values
|
|
* \return Self-Reference
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
RadixSort& RadixSort::sort(const uint32* input, uint32 nb, bool signedvalues)
|
|
{
|
|
uint32 i, j;
|
|
|
|
// Checkings
|
|
if(!input || !nb) return *this;
|
|
|
|
// Stats
|
|
mTotalCalls++;
|
|
|
|
// Resize lists if needed
|
|
CHECK_RESIZE(nb);
|
|
|
|
#ifdef RADIX_LOCAL_RAM
|
|
// Allocate histograms & offsets on the stack
|
|
uint32 mHistogram[256*4];
|
|
uint32 mOffset[256];
|
|
#endif
|
|
|
|
// Create histograms (counters). Counters for all passes are created in one run.
|
|
// Pros: read input buffer once instead of four times
|
|
// Cons: mHistogram is 4Kb instead of 1Kb
|
|
// We must take care of signed/unsigned values for temporal coherence.... I just
|
|
// have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
|
|
if(!signedvalues) { CREATE_HISTOGRAMS(uint32, input); }
|
|
else { CREATE_HISTOGRAMS(int32, input); }
|
|
|
|
// Compute #negative values involved if needed
|
|
uint32 NbNegativeValues = 0;
|
|
if(signedvalues)
|
|
{
|
|
// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
|
|
// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
|
|
// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
|
|
uint32* h3= &mHistogram[768];
|
|
for( i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
|
|
}
|
|
|
|
// Radix sort, j is the pass number (0=LSB, 3=MSB)
|
|
for( j=0;j<4;j++)
|
|
{
|
|
CHECK_PASS_VALIDITY(j);
|
|
|
|
// Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
|
|
// not a problem, numbers are correctly sorted anyway.
|
|
if(PerformPass)
|
|
{
|
|
// Should we care about negative values?
|
|
if(j!=3 || !signedvalues)
|
|
{
|
|
// Here we deal with positive values only
|
|
|
|
// Create offsets
|
|
mOffset[0] = 0;
|
|
for(i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
|
|
}
|
|
else
|
|
{
|
|
// This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
|
|
|
|
// Create biased offsets, in order for negative numbers to be sorted as well
|
|
mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
|
|
for(i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
|
|
|
|
// Fixing the wrong place for negative values
|
|
mOffset[128] = 0;
|
|
for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
|
|
}
|
|
|
|
// Perform Radix Sort
|
|
uint8* InputBytes = (uint8*)input;
|
|
uint32* Indices = mIndices;
|
|
uint32* IndicesEnd = &mIndices[nb];
|
|
InputBytes += j;
|
|
while(Indices!=IndicesEnd)
|
|
{
|
|
uint32 id = *Indices++;
|
|
mIndices2[mOffset[InputBytes[id<<2]]++] = id;
|
|
}
|
|
|
|
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
|
|
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Main sort routine.
|
|
* This one is for floating-point values. After the call, mIndices contains a list of indices in sorted order, i.e. in the order you may process your data.
|
|
* \param input [in] a list of floating-point values to sort
|
|
* \param nb [in] number of values to sort
|
|
* \return Self-Reference
|
|
* \warning only sorts IEEE floating-point values
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
RadixSort& RadixSort::sort(const float* input2, uint32 nb)
|
|
{
|
|
uint32 i, j;
|
|
|
|
// Checkings
|
|
if(!input2 || !nb) return *this;
|
|
|
|
// Stats
|
|
mTotalCalls++;
|
|
|
|
uint32* input = (uint32*)input2;
|
|
|
|
// Resize lists if needed
|
|
CHECK_RESIZE(nb);
|
|
|
|
#ifdef RADIX_LOCAL_RAM
|
|
// Allocate histograms & offsets on the stack
|
|
uint32 mHistogram[256*4];
|
|
uint32 mOffset[256];
|
|
#endif
|
|
|
|
// Create histograms (counters). Counters for all passes are created in one run.
|
|
// Pros: read input buffer once instead of four times
|
|
// Cons: mHistogram is 4Kb instead of 1Kb
|
|
// Floating-point values are always supposed to be signed values, so there's only one code path there.
|
|
// Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
|
|
// is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
|
|
// generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
|
|
// wouldn't work with mixed positive/negative values....
|
|
{ CREATE_HISTOGRAMS(float, input2); }
|
|
|
|
// Compute #negative values involved if needed
|
|
uint32 NbNegativeValues = 0;
|
|
// An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
|
|
// last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
|
|
// responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
|
|
uint32* h3= &mHistogram[768];
|
|
for( i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
|
|
|
|
// Radix sort, j is the pass number (0=LSB, 3=MSB)
|
|
for( j=0;j<4;j++)
|
|
{
|
|
// Should we care about negative values?
|
|
if(j!=3)
|
|
{
|
|
// Here we deal with positive values only
|
|
CHECK_PASS_VALIDITY(j);
|
|
|
|
if(PerformPass)
|
|
{
|
|
// Create offsets
|
|
mOffset[0] = 0;
|
|
for( i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
|
|
|
|
// Perform Radix Sort
|
|
uint8* InputBytes = (uint8*)input;
|
|
uint32* Indices = mIndices;
|
|
uint32* IndicesEnd = &mIndices[nb];
|
|
InputBytes += j;
|
|
while(Indices!=IndicesEnd)
|
|
{
|
|
uint32 id = *Indices++;
|
|
mIndices2[mOffset[InputBytes[id<<2]]++] = id;
|
|
}
|
|
|
|
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
|
|
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// This is a special case to correctly handle negative values
|
|
CHECK_PASS_VALIDITY(j);
|
|
|
|
if(PerformPass)
|
|
{
|
|
// Create biased offsets, in order for negative numbers to be sorted as well
|
|
mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
|
|
for(i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
|
|
|
|
// We must reverse the sorting order for negative numbers!
|
|
mOffset[255] = 0;
|
|
for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
|
|
for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
|
|
|
|
// Perform Radix Sort
|
|
for(i=0;i<nb;i++)
|
|
{
|
|
uint32 Radix = input[mIndices[i]]>>24; // Radix byte, same as above. AND is useless here (uint32).
|
|
// ### cmp to be killed. Not good. Later.
|
|
if(Radix<128) mIndices2[mOffset[Radix]++] = mIndices[i]; // Number is positive, same as above
|
|
else mIndices2[--mOffset[Radix]] = mIndices[i]; // Number is negative, flip the sorting order
|
|
}
|
|
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
|
|
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
|
|
}
|
|
else
|
|
{
|
|
// The pass is useless, yet we still have to reverse the order of current list if all values are negative.
|
|
if(UniqueVal>=128)
|
|
{
|
|
for(i=0;i<nb;i++) mIndices2[i] = mIndices[nb-i-1];
|
|
|
|
// Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap.
|
|
uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Resets the inner indices. After the call, mIndices is reset.
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
void RadixSort::resetIndices()
|
|
{
|
|
for(uint32 i=0;i<mCurrentSize;i++) mIndices[i] = i;
|
|
}
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/**
|
|
* Gets the ram used.
|
|
* \return memory used in bytes
|
|
*/
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
uint32 RadixSort::usedRam() const
|
|
{
|
|
uint32 UsedRam = sizeof(RadixSort);
|
|
#ifndef RADIX_LOCAL_RAM
|
|
UsedRam += 256*4*sizeof(uint32); // Histograms
|
|
UsedRam += 256*sizeof(uint32); // Offsets
|
|
#endif
|
|
UsedRam += 2*mCurrentSize*sizeof(uint32); // 2 lists of indices
|
|
return UsedRam;
|
|
}
|