rename 2.0 tag to 2.0.0

2.0.0
castano 16 years ago
parent 7933aeea09
commit a1b655d0d5

@ -1,12 +1,3 @@
NVIDIA Texture Tools version 2.1.0
* CTX1 CUDA compressor.
* DXT1n CUDA compressor.
NVIDIA Texture Tools version 2.0.1
* Fix memory leaks.
* Pre-allocate device memory for CUDA compressor.
* Add single color compressor.
NVIDIA Texture Tools version 2.0.0
* Fixed PSNR formula in nvimgdiff.
* Added support for arbitrary RGB formats.

@ -2,7 +2,7 @@
--------------------------------------------------------------------------------
NVIDIA Texture Tools
README.txt
Version 2.1
Version 2.0.0
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

@ -1 +1 @@
2.1.0
2.0.0

Binary file not shown.

Binary file not shown.

@ -327,10 +327,6 @@
RelativePath="..\..\..\src\nvcore\nvcore.h"
>
</File>
<File
RelativePath="..\..\..\src\nvcore\Ptr.h"
>
</File>
<File
RelativePath="..\..\..\src\nvcore\StrLib.h"
>

@ -3,18 +3,18 @@ Microsoft Visual Studio Solution File, Format Version 9.00
# Visual Studio 2005
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvtt", "nvtt\nvtt.vcproj", "{1AEB7681-57D8-48EE-813D-5C41CC38B647}"
ProjectSection(ProjectDependencies) = postProject
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
{CE017322-01FC-4851-9C8B-64E9A8E26C38} = {CE017322-01FC-4851-9C8B-64E9A8E26C38}
{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvcompress", "nvcompress\nvcompress.vcproj", "{88079E38-83AA-4E8A-B18A-66A78D1B058B}"
ProjectSection(ProjectDependencies) = postProject
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvimage", "nvimage\nvimage.vcproj", "{4046F392-A18B-4C66-9639-3EABFFF5D531}"
@ -34,16 +34,16 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvddsinfo", "nvddsinfo\nvdd
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvdecompress", "nvdecompress\nvdecompress.vcproj", "{75A0527D-BFC9-49C3-B46B-CD1A901D5927}"
ProjectSection(ProjectDependencies) = postProject
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvimgdiff", "nvimgdiff\nvimgdiff.vcproj", "{05A59E8B-EA70-4F22-89E8-E0927BA13064}"
ProjectSection(ProjectDependencies) = postProject
{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvassemble", "nvassemble\nvassemble.vcproj", "{3BC6D760-91E8-4FFB-BD0E-F86F367AD8EA}"
@ -55,18 +55,13 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvassemble", "nvassemble\nv
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvzoom", "nvzoom\nvzoom.vcproj", "{51999D3E-EF22-4BDD-965F-4201034D3DCE}"
ProjectSection(ProjectDependencies) = postProject
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
{4046F392-A18B-4C66-9639-3EABFFF5D531} = {4046F392-A18B-4C66-9639-3EABFFF5D531}
{50C465FE-B308-42BC-894D-89484482AF06} = {50C465FE-B308-42BC-894D-89484482AF06}
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Nvidia.TextureTools", "Nvidia.TextureTools\Nvidia.TextureTools.csproj", "{CAB55C39-8FA9-4912-98D9-E52669C8911D}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stress", "stress\stress.vcproj", "{317B694E-B5C1-42A6-956F-FC12B69175A6}"
ProjectSection(ProjectDependencies) = postProject
{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug (no cuda)|Any CPU = Debug (no cuda)|Any CPU
@ -319,22 +314,6 @@ Global
{CAB55C39-8FA9-4912-98D9-E52669C8911D}.Release|Any CPU.Build.0 = Release|Any CPU
{CAB55C39-8FA9-4912-98D9-E52669C8911D}.Release|Win32.ActiveCfg = Release|Any CPU
{CAB55C39-8FA9-4912-98D9-E52669C8911D}.Release|x64.ActiveCfg = Release|Any CPU
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug (no cuda)|Any CPU.ActiveCfg = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug (no cuda)|Win32.ActiveCfg = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug (no cuda)|Win32.Build.0 = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug (no cuda)|x64.ActiveCfg = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Any CPU.ActiveCfg = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Win32.ActiveCfg = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|Win32.Build.0 = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Debug|x64.ActiveCfg = Debug|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release (no cuda)|Any CPU.ActiveCfg = Release|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release (no cuda)|Win32.ActiveCfg = Release|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release (no cuda)|Win32.Build.0 = Release|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release (no cuda)|x64.ActiveCfg = Release|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|Any CPU.ActiveCfg = Release|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|Win32.ActiveCfg = Release|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|Win32.Build.0 = Release|Win32
{317B694E-B5C1-42A6-956F-FC12B69175A6}.Release|x64.ActiveCfg = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

@ -53,8 +53,8 @@ END
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,1,0,0
PRODUCTVERSION 2,1,0,0
FILEVERSION 2,0,0,0
PRODUCTVERSION 2,0,0,0
FILEFLAGSMASK 0x17L
#ifdef _DEBUG
FILEFLAGS 0x1L
@ -71,12 +71,12 @@ BEGIN
BEGIN
VALUE "CompanyName", "NVIDIA Corporation"
VALUE "FileDescription", "NVIDIA Texture Tools Dynamic Link Library"
VALUE "FileVersion", "2, 1, 0, 0"
VALUE "FileVersion", "2, 0, 0, 0"
VALUE "InternalName", "nvtt"
VALUE "LegalCopyright", "Copyright (C) 2007"
VALUE "OriginalFilename", "nvtt.dll"
VALUE "ProductName", "NVIDIA Texture Tools Dynamic Link Library"
VALUE "ProductVersion", "2, 1, 0, 0"
VALUE "ProductVersion", "2, 0, 0, 0"
END
END
BLOCK "VarFileInfo"

@ -1,201 +0,0 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="8.00"
Name="stress"
ProjectGUID="{317B694E-B5C1-42A6-956F-FC12B69175A6}"
RootNamespace="stress"
Keyword="Win32Proj"
>
<Platforms>
<Platform
Name="Win32"
/>
</Platforms>
<ToolFiles>
</ToolFiles>
<Configurations>
<Configuration
Name="Debug|Win32"
OutputDirectory="$(ConfigurationName)\$(PlatformName)"
IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
ConfigurationType="1"
CharacterSet="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..;..\..\..\src"
PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
UsePrecompiledHeader="0"
WarningLevel="3"
Detect64BitPortabilityProblems="true"
DebugInformationFormat="4"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
LinkIncremental="2"
GenerateDebugInformation="true"
SubSystem="1"
TargetMachine="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCWebDeploymentTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Release|Win32"
OutputDirectory="$(ConfigurationName)\$(PlatformName)"
IntermediateDirectory="$(ConfigurationName)\$(PlatformName)"
ConfigurationType="1"
CharacterSet="1"
WholeProgramOptimization="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
AdditionalIncludeDirectories="..;..\..\..\src"
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
RuntimeLibrary="2"
UsePrecompiledHeader="0"
WarningLevel="3"
Detect64BitPortabilityProblems="true"
DebugInformationFormat="3"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile="$(SolutionDir)\$(ConfigurationName).$(PlatformName)\bin\$(ProjectName).exe"
LinkIncremental="1"
GenerateDebugInformation="true"
SubSystem="1"
OptimizeReferences="2"
EnableCOMDATFolding="2"
TargetMachine="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCWebDeploymentTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
</Configurations>
<References>
</References>
<Files>
<Filter
Name="Source Files"
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
RelativePath="..\..\..\src\nvtt\tests\stress.cpp"
>
</File>
</Filter>
<Filter
Name="Header Files"
Filter="h;hpp;hxx;hm;inl;inc;xsd"
UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
>
</Filter>
<Filter
Name="Resource Files"
Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
>
</Filter>
</Files>
<Globals>
</Globals>
</VisualStudioProject>

@ -110,19 +110,6 @@ void ColorBlock::splatY()
}
}
/// Returns true if the block has a single color.
bool ColorBlock::isSingleColor() const
{
for(int i = 1; i < 16; i++)
{
if (m_color[0] != m_color[i])
{
return false;
}
}
return true;
}
/// Count number of unique colors in this color block.
uint ColorBlock::countUniqueColors() const

@ -24,7 +24,6 @@ namespace nv
void splatX();
void splatY();
bool isSingleColor() const;
uint countUniqueColors() const;
Color32 averageColor() const;
bool hasAlpha() const;

@ -79,9 +79,6 @@ TARGET_LINK_LIBRARIES(nvassemble nvcore nvmath nvimage)
ADD_EXECUTABLE(filtertest tests/filtertest.cpp tools/cmdline.h)
TARGET_LINK_LIBRARIES(filtertest nvcore nvmath nvimage)
ADD_EXECUTABLE(stress tests/stress.cpp tools/cmdline.h)
TARGET_LINK_LIBRARIES(stress nvcore nvmath nvimage nvtt)
ADD_EXECUTABLE(nvzoom tools/resize.cpp tools/cmdline.h)
TARGET_LINK_LIBRARIES(nvzoom nvcore nvmath nvimage)

@ -69,14 +69,7 @@ void nv::fastCompressDXT1(const Image * image, const OutputOptions::Private & ou
for (uint x = 0; x < w; x += 4) {
rgba.init(image, x, y);
if (rgba.isSingleColor())
{
QuickCompress::compressDXT1(rgba.color(0), &block);
}
else
{
QuickCompress::compressDXT1(rgba, &block);
}
if (outputOptions.outputHandler != NULL) {
outputOptions.outputHandler->writeData(&block, sizeof(block));
@ -221,16 +214,10 @@ void nv::compressDXT1(const Image * image, const OutputOptions::Private & output
rgba.init(image, x, y);
if (rgba.isSingleColor())
{
QuickCompress::compressDXT1(rgba.color(0), &block);
}
else
{
// Compress color.
squish::ColourSet colours((uint8 *)rgba.colors(), 0);
fit.SetColourSet(&colours, squish::kDxt1);
fit.Compress(&block);
}
if (outputOptions.outputHandler != NULL) {
outputOptions.outputHandler->writeData(&block, sizeof(block));

@ -56,7 +56,7 @@ namespace
static int blockSize(Format format)
{
if (format == Format_DXT1 || format == Format_DXT1a || format == Format_DXT1n) {
if (format == Format_DXT1 || format == Format_DXT1a) {
return 8;
}
else if (format == Format_DXT3) {
@ -71,9 +71,6 @@ namespace
else if (format == Format_BC5) {
return 16;
}
else if (format == Format_CTX1) {
return 8;
}
return 0;
}
@ -208,19 +205,16 @@ namespace nvtt
Compressor::Compressor() : m(*new Compressor::Private())
{
// CUDA initialization.
m.cudaSupported = cuda::isHardwarePresent();
m.cudaEnabled = m.cudaSupported;
if (m.cudaEnabled)
{
m.cuda = new CudaCompressor();
}
// @@ Do CUDA initialization here.
}
Compressor::~Compressor()
{
delete &m;
// @@ Free CUDA resources here.
}
@ -231,11 +225,6 @@ void Compressor::enableCudaAcceleration(bool enable)
{
m.cudaEnabled = enable;
}
if (m.cudaEnabled && m.cuda == NULL)
{
m.cuda = new CudaCompressor();
}
}
/// Check if CUDA acceleration is enabled.
@ -336,7 +325,7 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
{
header.setLinearSize(computeImageSize(inputOptions.targetWidth, inputOptions.targetHeight, inputOptions.targetDepth, compressionOptions.bitcount, compressionOptions.format));
if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a || compressionOptions.format == Format_DXT1n) {
if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a) {
header.setFourCC('D', 'X', 'T', '1');
if (inputOptions.isNormalMap) header.setNormalFlag(true);
}
@ -357,10 +346,6 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
header.setFourCC('A', 'T', 'I', '2');
if (inputOptions.isNormalMap) header.setNormalFlag(true);
}
else if (compressionOptions.format == Format_CTX1) {
header.setFourCC('C', 'T', 'X', '1');
if (inputOptions.isNormalMap) header.setNormalFlag(true);
}
}
// Swap bytes if necessary.
@ -685,7 +670,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT1(image, outputOptions, compressionOptions);
cudaCompressDXT1(image, outputOptions, compressionOptions);
}
else
{
@ -712,18 +697,6 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
}
}
}
else if (compressionOptions.format == Format_DXT1n)
{
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT1n(image, outputOptions, compressionOptions);
}
else
{
if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
}
}
else if (compressionOptions.format == Format_DXT3)
{
if (compressionOptions.quality == Quality_Fastest)
@ -735,7 +708,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT3(image, outputOptions, compressionOptions);
cudaCompressDXT3(image, outputOptions, compressionOptions);
}
else
{
@ -754,7 +727,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressDXT5(image, outputOptions, compressionOptions);
cudaCompressDXT5(image, outputOptions, compressionOptions);
}
else
{
@ -781,18 +754,6 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
{
compressBC5(image, outputOptions, compressionOptions);
}
else if (compressionOptions.format == Format_CTX1)
{
if (cudaEnabled)
{
nvDebugCheck(cudaSupported);
cuda->compressCTX1(image, outputOptions, compressionOptions);
}
else
{
if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_UnsupportedFeature);
}
}
return true;
}

@ -24,10 +24,6 @@
#ifndef NV_TT_COMPRESSOR_H
#define NV_TT_COMPRESSOR_H
#include <nvcore/Ptr.h>
#include <nvtt/cuda/CudaCompressDXT.h>
#include "nvtt.h"
namespace nv
@ -67,9 +63,6 @@ namespace nvtt
bool cudaSupported;
bool cudaEnabled;
nv::AutoPtr<nv::CudaCompressor> cuda;
};
} // nvtt namespace

@ -353,18 +353,12 @@ static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block)
void QuickCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock)
{
dxtBlock->col0.r = OMatch5[c.r][0];
dxtBlock->col0.g = OMatch6[c.g][0];
dxtBlock->col0.g = OMatch5[c.g][0];
dxtBlock->col0.b = OMatch5[c.b][0];
dxtBlock->col1.r = OMatch5[c.r][1];
dxtBlock->col1.g = OMatch6[c.g][1];
dxtBlock->col1.g = OMatch5[c.g][1];
dxtBlock->col1.b = OMatch5[c.b][1];
dxtBlock->indices = 0xaaaaaaaa;
if (dxtBlock->col0.u < dxtBlock->col1.u)
{
swap(dxtBlock->col0.u, dxtBlock->col1.u);
dxtBlock->indices ^= 0x55555555;
}
}
void QuickCompress::compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock)

@ -48,12 +48,7 @@ void initTables()
};
*/
#if __CUDACC__
__constant__ unsigned short
#else
const static uint8
#endif
OMatch5[256][2] =
const static uint8 OMatch5[256][2] =
{
{0x00, 0x00},
{0x00, 0x00},
@ -313,12 +308,7 @@ OMatch5[256][2] =
{0x1F, 0x1F},
};
#if __CUDACC__
__constant__ unsigned short
#else
const static uint8
#endif
OMatch6[256][2] =
const static uint8 OMatch6[256][2] =
{
{0x00, 0x00},
{0x00, 0x01},

@ -122,7 +122,7 @@ static void doPrecomputation()
*/
const static uint s_bitmapTable[992] =
const static uint bitmaps[992] =
{
0x80000000,
0x40000000,

@ -28,8 +28,6 @@
#include "CudaMath.h"
#include "../SingleColorLookup.h"
#define NUM_THREADS 64 // Number of threads per block.
#if __DEVICE_EMULATION__
@ -62,7 +60,6 @@ __device__ void sortColors(const float * values, int * cmp)
{
int tid = threadIdx.x;
#if 1
cmp[tid] = (values[0] < values[tid]);
cmp[tid] += (values[1] < values[tid]);
cmp[tid] += (values[2] < values[tid]);
@ -96,30 +93,13 @@ __device__ void sortColors(const float * values, int * cmp)
if (tid > 12 && cmp[tid] == cmp[12]) ++cmp[tid];
if (tid > 13 && cmp[tid] == cmp[13]) ++cmp[tid];
if (tid > 14 && cmp[tid] == cmp[14]) ++cmp[tid];
#else
cmp[tid] = 0;
#pragma unroll
for (int i = 0; i < 16; i++)
{
cmp[tid] += (values[i] < values[tid]);
}
// Resolve elements with the same index.
#pragma unroll
for (int i = 0; i < 15; i++)
{
if (tid > 0 && cmp[tid] == cmp[i]) ++cmp[tid];
}
#endif
}
////////////////////////////////////////////////////////////////////////////////
// Load color block to shared mem
////////////////////////////////////////////////////////////////////////////////
__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16])
{
const int bid = blockIdx.x;
const int idx = threadIdx.x;
@ -144,8 +124,6 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
colorSums(colors, sums);
float3 axis = bestFitLine(colors, sums[0], kColorMetric);
*sameColor = (axis == make_float3(0, 0, 0));
dps[idx] = dot(colors[idx], axis);
#if __DEVICE_EMULATION__
@ -205,48 +183,11 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
}
}
__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16])
{
const int bid = blockIdx.x;
const int idx = threadIdx.x;
__shared__ float dps[16];
if (idx < 16)
{
// Read color and copy to shared mem.
uint c = image[(bid) * 16 + idx];
colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f);
colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f);
// No need to synchronize, 16 < warp size.
#if __DEVICE_EMULATION__
} __debugsync(); if (idx < 16) {
#endif
// Sort colors along the best fit line.
colorSums(colors, sums);
float2 axis = bestFitLine(colors, sums[0]);
dps[idx] = dot(colors[idx], axis);
#if __DEVICE_EMULATION__
} __debugsync(); if (idx < 16) {
#endif
sortColors(dps, xrefs);
float2 tmp = colors[idx];
colors[xrefs[idx]] = tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Round color to RGB565 and expand
////////////////////////////////////////////////////////////////////////////////
inline __device__ float3 roundAndExpand565(float3 v, ushort * w)
inline __device__ float3 roundAndExpand(float3 v, ushort * w)
{
v.x = rintf(__saturatef(v.x) * 31.0f);
v.y = rintf(__saturatef(v.y) * 63.0f);
@ -258,26 +199,6 @@ inline __device__ float3 roundAndExpand565(float3 v, ushort * w)
return v;
}
inline __device__ float2 roundAndExpand56(float2 v, ushort * w)
{
v.x = rintf(__saturatef(v.x) * 31.0f);
v.y = rintf(__saturatef(v.y) * 63.0f);
*w = ((ushort)v.x << 11) | ((ushort)v.y << 5);
v.x *= 0.03227752766457f; // approximate integer bit expansion.
v.y *= 0.01583151765563f;
return v;
}
inline __device__ float2 roundAndExpand88(float2 v, ushort * w)
{
v.x = rintf(__saturatef(v.x) * 255.0f);
v.y = rintf(__saturatef(v.y) * 255.0f);
*w = ((ushort)v.x << 8) | ((ushort)v.y);
v.x *= 1.0f / 255.0f;
v.y *= 1.0f / 255.0f;
return v;
}
////////////////////////////////////////////////////////////////////////////////
// Evaluate permutations
@ -313,8 +234,8 @@ __device__ float evalPermutation4(const float3 * colors, uint permutation, ushor
float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6-5 color and expand...
a = roundAndExpand565(a, start);
b = roundAndExpand565(b, end);
a = roundAndExpand(a, start);
b = roundAndExpand(b, end);
// compute the error
float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -353,8 +274,8 @@ __device__ float evalPermutation3(const float3 * colors, uint permutation, ushor
float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6-5 color and expand...
a = roundAndExpand565(a, start);
b = roundAndExpand565(b, end);
a = roundAndExpand(a, start);
b = roundAndExpand(b, end);
// compute the error
float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -394,8 +315,8 @@ __device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint
float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6-5 color and expand...
a = roundAndExpand565(a, start);
b = roundAndExpand565(b, end);
a = roundAndExpand(a, start);
b = roundAndExpand(b, end);
// compute the error
float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -430,8 +351,8 @@ __device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint
float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6-5 color and expand...
a = roundAndExpand565(a, start);
b = roundAndExpand565(b, end);
a = roundAndExpand(a, start);
b = roundAndExpand(b, end);
// compute the error
float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -470,8 +391,8 @@ __device__ float evalPermutation4(const float3 * colors, const float * weights,
float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6-5 color and expand...
a = roundAndExpand565(a, start);
b = roundAndExpand565(b, end);
a = roundAndExpand(a, start);
b = roundAndExpand(b, end);
// compute the error
float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -511,8 +432,8 @@ __device__ float evalPermutation3(const float3 * colors, const float * weights,
float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6-5 color and expand...
a = roundAndExpand565(a, start);
b = roundAndExpand565(b, end);
a = roundAndExpand(a, start);
b = roundAndExpand(b, end);
// compute the error
float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -521,114 +442,6 @@ __device__ float evalPermutation3(const float3 * colors, const float * weights,
}
*/
__device__ float evalPermutation4(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end)
{
// Compute endpoints using least squares.
float2 alphax_sum = make_float2(0.0f, 0.0f);
uint akku = 0;
// Compute alpha & beta for this permutation.
#pragma unroll
for (int i = 0; i < 16; i++)
{
const uint bits = permutation >> (2*i);
alphax_sum += alphaTable4[bits & 3] * colors[i];
akku += prods4[bits & 3];
}
float alpha2_sum = float(akku >> 16);
float beta2_sum = float((akku >> 8) & 0xff);
float alphabeta_sum = float(akku & 0xff);
float2 betax_sum = 9.0f * color_sum - alphax_sum;
const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6 color and expand...
a = roundAndExpand56(a, start);
b = roundAndExpand56(b, end);
// compute the error
float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
return (1.0f / 9.0f) * (e.x + e.y);
}
__device__ float evalPermutation3(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end)
{
// Compute endpoints using least squares.
float2 alphax_sum = make_float2(0.0f, 0.0f);
uint akku = 0;
// Compute alpha & beta for this permutation.
#pragma unroll
for (int i = 0; i < 16; i++)
{
const uint bits = permutation >> (2*i);
alphax_sum += alphaTable3[bits & 3] * colors[i];
akku += prods3[bits & 3];
}
float alpha2_sum = float(akku >> 16);
float beta2_sum = float((akku >> 8) & 0xff);
float alphabeta_sum = float(akku & 0xff);
float2 betax_sum = 4.0f * color_sum - alphax_sum;
const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 5-6 color and expand...
a = roundAndExpand56(a, start);
b = roundAndExpand56(b, end);
// compute the error
float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
return (1.0f / 4.0f) * (e.x + e.y);
}
__device__ float evalPermutationCTX(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end)
{
// Compute endpoints using least squares.
float2 alphax_sum = make_float2(0.0f, 0.0f);
uint akku = 0;
// Compute alpha & beta for this permutation.
#pragma unroll
for (int i = 0; i < 16; i++)
{
const uint bits = permutation >> (2*i);
alphax_sum += alphaTable4[bits & 3] * colors[i];
akku += prods4[bits & 3];
}
float alpha2_sum = float(akku >> 16);
float beta2_sum = float((akku >> 8) & 0xff);
float alphabeta_sum = float(akku & 0xff);
float2 betax_sum = 9.0f * color_sum - alphax_sum;
const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
// Round a, b to the closest 8-8 color and expand...
a = roundAndExpand88(a, start);
b = roundAndExpand88(b, end);
// compute the error
float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
return (1.0f / 9.0f) * (e.x + e.y);
}
////////////////////////////////////////////////////////////////////////////////
// Evaluate all permutations
@ -757,67 +570,6 @@ __device__ void evalAllPermutations(const float3 * colors, const float * weights
}
*/
__device__ void evalAllPermutations(const float2 * colors, float2 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
{
const int idx = threadIdx.x;
float bestError = FLT_MAX;
__shared__ uint s_permutations[160];
for(int i = 0; i < 16; i++)
{
int pidx = idx + NUM_THREADS * i;
if (pidx >= 992) break;
ushort start, end;
uint permutation = permutations[pidx];
if (pidx < 160) s_permutations[pidx] = permutation;
float error = evalPermutation4(colors, colorSum, permutation, &start, &end);
if (error < bestError)
{
bestError = error;
bestPermutation = permutation;
bestStart = start;
bestEnd = end;
}
}
if (bestStart < bestEnd)
{
swap(bestEnd, bestStart);
bestPermutation ^= 0x55555555; // Flip indices.
}
for(int i = 0; i < 3; i++)
{
int pidx = idx + NUM_THREADS * i;
if (pidx >= 160) break;
ushort start, end;
uint permutation = s_permutations[pidx];
float error = evalPermutation3(colors, colorSum, permutation, &start, &end);
if (error < bestError)
{
bestError = error;
bestPermutation = permutation;
bestStart = start;
bestEnd = end;
if (bestStart > bestEnd)
{
swap(bestEnd, bestStart);
bestPermutation ^= (~bestPermutation >> 1) & 0x55555555; // Flip indices.
}
}
}
errors[idx] = bestError;
}
__device__ void evalLevel4Permutations(const float3 * colors, const float * weights, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
{
const int idx = threadIdx.x;
@ -852,39 +604,6 @@ __device__ void evalLevel4Permutations(const float3 * colors, const float * weig
errors[idx] = bestError;
}
__device__ void evalAllPermutationsCTX(const float2 * colors, float2 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors)
{
const int idx = threadIdx.x;
float bestError = FLT_MAX;
for(int i = 0; i < 16; i++)
{
int pidx = idx + NUM_THREADS * i;
if (pidx >= 992) break;
ushort start, end;
uint permutation = permutations[pidx];
float error = evalPermutationCTX(colors, colorSum, permutation, &start, &end);
if (error < bestError)
{
bestError = error;
bestPermutation = permutation;
bestStart = start;
bestEnd = end;
}
}
if (bestStart < bestEnd)
{
swap(bestEnd, bestStart);
bestPermutation ^= 0x55555555; // Flip indices.
}
errors[idx] = bestError;
}
////////////////////////////////////////////////////////////////////////////////
@ -996,55 +715,22 @@ __device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xr
result[bid].y = indices;
}
__device__ void saveBlockCTX1(ushort start, ushort end, uint permutation, int xrefs[16], uint2 * result)
{
saveBlockDXT1(start, end, permutation, xrefs, result);
}
__device__ void saveSingleColorBlockDXT1(float3 color, uint2 * result)
{
const int bid = blockIdx.x;
int r = color.x * 255;
int g = color.y * 255;
int b = color.z * 255;
ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0];
ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1];
if (color0 < color1)
{
result[bid].x = (color0 << 16) | color1;
result[bid].y = 0xffffffff;
}
else
{
result[bid].x = (color1 << 16) | color0;
result[bid].y = 0xaaaaaaaa;
}
}
////////////////////////////////////////////////////////////////////////////////
// Compress color block
////////////////////////////////////////////////////////////////////////////////
__global__ void compressDXT1(const uint * permutations, const uint * image, uint2 * result)
__global__ void compress(const uint * permutations, const uint * image, uint2 * result)
{
__shared__ float3 colors[16];
__shared__ float3 sums[16];
__shared__ int xrefs[16];
__shared__ int sameColor;
loadColorBlock(image, colors, sums, xrefs, &sameColor);
loadColorBlock(image, colors, sums, xrefs);
__syncthreads();
if (sameColor)
{
if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
return;
}
ushort bestStart, bestEnd;
uint bestPermutation;
@ -1063,7 +749,7 @@ __global__ void compressDXT1(const uint * permutations, const uint * image, uint
}
__global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result)
__global__ void compressWeighted(const uint * permutations, const uint * image, uint2 * result)
{
__shared__ float3 colors[16];
__shared__ float3 sums[16];
@ -1092,61 +778,6 @@ __global__ void compressWeightedDXT1(const uint * permutations, const uint * ima
}
__global__ void compressNormalDXT1(const uint * permutations, const uint * image, uint2 * result)
{
__shared__ float2 colors[16];
__shared__ float2 sums[16];
__shared__ int xrefs[16];
loadColorBlock(image, colors, sums, xrefs);
__syncthreads();
ushort bestStart, bestEnd;
uint bestPermutation;
__shared__ float errors[NUM_THREADS];
evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
// Use a parallel reduction to find minimum error.
const int minIdx = findMinError(errors);
// Only write the result of the winner thread.
if (threadIdx.x == minIdx)
{
saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result);
}
}
__global__ void compressCTX1(const uint * permutations, const uint * image, uint2 * result)
{
__shared__ float2 colors[16];
__shared__ float2 sums[16];
__shared__ int xrefs[16];
loadColorBlock(image, colors, sums, xrefs);
__syncthreads();
ushort bestStart, bestEnd;
uint bestPermutation;
__shared__ float errors[NUM_THREADS];
evalAllPermutationsCTX(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors);
// Use a parallel reduction to find minimum error.
const int minIdx = findMinError(errors);
// Only write the result of the winner thread.
if (threadIdx.x == minIdx)
{
saveBlockCTX1(bestStart, bestEnd, bestPermutation, xrefs, result);
}
}
/*
__device__ float computeError(const float weights[16], uchar a0, uchar a1)
{
@ -1214,8 +845,8 @@ __device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1)
float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
a0 = roundAndExpand8(a);
a1 = roundAndExpand8(b);
a0 = roundAndExpand(a);
a1 = roundAndExpand(b);
}
*/
/*
@ -1347,22 +978,12 @@ extern "C" void setupCompressKernel(const float weights[3])
// Launch kernel
////////////////////////////////////////////////////////////////////////////////
extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
{
compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
}
extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
{
compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
}
extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
{
compressNormalDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
compress<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
}
extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
extern "C" void compressWeightedKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
{
compressCTX1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
compressWeighted<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
}

@ -48,18 +48,29 @@ using namespace nvtt;
#if defined HAVE_CUDA
//#define MAX_BLOCKS 32768U // 49152, 65535
#define MAX_BLOCKS 8192U // 49152, 65535
extern "C" void setupCompressKernel(const float weights[3]);
extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressWeightedKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
#include "Bitmaps.h"
extern "C" void setupCompressKernel(const float weights[3]);
extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
// @@ Store this pointer in CompressionOptions. Allocate in ctor, free in dtor.
static uint * d_bitmaps = NULL;
static void doPrecomputation()
{
if (d_bitmaps != NULL) {
return;
}
// Upload bitmaps.
cudaMalloc((void**) &d_bitmaps, 992 * sizeof(uint));
cudaMemcpy(d_bitmaps, bitmaps, 992 * sizeof(uint), cudaMemcpyHostToDevice);
#include "Bitmaps.h" // @@ Rename to BitmapTable.h
// @@ Check for errors.
// @@ Free allocated memory.
}
// Convert linear image to block linear.
static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
@ -81,68 +92,53 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
}
}
#endif
CudaCompressor::CudaCompressor()
{
#if defined HAVE_CUDA
// Allocate and upload bitmaps.
cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint));
cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
// Allocate scratch buffers.
cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U);
cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U);
#endif
}
CudaCompressor::~CudaCompressor()
{
#if defined HAVE_CUDA
// Free device mem allocations.
cudaFree(m_data);
cudaFree(m_result);
cudaFree(m_bitmapTable);
#endif
}
#endif // defined HAVE_CUDA
// @@ This code is very repetitive and needs to be cleaned up.
/// Compress image using CUDA.
void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
doPrecomputation();
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
convertToBlockLinear(image, blockLinearImage); // @@ Do this on the GPU!
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
const uint blockMax = 32768; // 49152, 65535
clock_t start = clock();
// Allocate image in device memory.
uint * d_data = NULL;
cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
// Allocate result.
uint * d_result = NULL;
cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
setupCompressKernel(compressionOptions.colorWeight.ptr());
// TODO: Add support for multiple GPUs.
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, MAX_BLOCKS);
uint count = min(blockNum - bn, blockMax);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressKernelDXT1(count, m_data, m_result, m_bitmapTable);
compressKernel(count, d_data, d_result, d_bitmaps);
// Check for errors.
cudaError_t err = cudaGetLastError();
@ -157,7 +153,7 @@ void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Priv
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
@ -172,6 +168,8 @@ void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Priv
printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
free(blockLinearImage);
cudaFree(d_data);
cudaFree(d_result);
#else
if (outputOptions.errorHandler != NULL)
@ -183,11 +181,13 @@ void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Priv
/// Compress image using CUDA.
void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
doPrecomputation();
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
@ -198,9 +198,18 @@ void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Priv
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
const uint blockMax = 32768; // 49152, 65535
// Allocate image in device memory.
uint * d_data = NULL;
cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
// Allocate result.
uint * d_result = NULL;
cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
AlphaBlockDXT3 * alphaBlocks = NULL;
alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));
alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, blockMax * 8U));
setupCompressKernel(compressionOptions.colorWeight.ptr());
@ -209,12 +218,12 @@ void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Priv
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, MAX_BLOCKS);
uint count = min(blockNum - bn, blockMax);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
compressWeightedKernel(count, d_data, d_result, d_bitmaps);
// Compress alpha in parallel with the GPU.
for (uint i = 0; i < count; i++)
@ -236,7 +245,7 @@ void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Priv
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
@ -256,6 +265,8 @@ void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Priv
free(alphaBlocks);
free(blockLinearImage);
cudaFree(d_data);
cudaFree(d_result);
#else
if (outputOptions.errorHandler != NULL)
@ -267,11 +278,13 @@ void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Priv
/// Compress image using CUDA.
void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
doPrecomputation();
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
@ -282,9 +295,18 @@ void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Priv
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
const uint blockMax = 32768; // 49152, 65535
// Allocate image in device memory.
uint * d_data = NULL;
cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
// Allocate result.
uint * d_result = NULL;
cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
AlphaBlockDXT5 * alphaBlocks = NULL;
alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));
alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, blockMax * 8U));
setupCompressKernel(compressionOptions.colorWeight.ptr());
@ -293,12 +315,12 @@ void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Priv
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, MAX_BLOCKS);
uint count = min(blockNum - bn, blockMax);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);
compressWeightedKernel(count, d_data, d_result, d_bitmaps);
// Compress alpha in parallel with the GPU.
for (uint i = 0; i < count; i++)
@ -320,7 +342,7 @@ void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Priv
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
@ -340,6 +362,8 @@ void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Priv
free(alphaBlocks);
free(blockLinearImage);
cudaFree(d_data);
cudaFree(d_result);
#else
if (outputOptions.errorHandler != NULL)
@ -350,147 +374,9 @@ void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Priv
}
void CudaCompressor::compressDXT1n(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
clock_t start = clock();
setupCompressKernel(compressionOptions.colorWeight.ptr());
// TODO: Add support for multiple GPUs.
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, MAX_BLOCKS);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressNormalKernelDXT1(count, m_data, m_result, m_bitmapTable);
// Check for errors.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
if (outputOptions.errorHandler != NULL)
{
outputOptions.errorHandler->error(Error_CudaError);
}
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
{
outputOptions.outputHandler->writeData(blockLinearImage, count * 8);
}
bn += count;
}
clock_t end = clock();
printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
free(blockLinearImage);
#else
if (outputOptions.errorHandler != NULL)
{
outputOptions.errorHandler->error(Error_CudaError);
}
#endif
}
void CudaCompressor::compressCTX1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions)
{
nvDebugCheck(cuda::isHardwarePresent());
#if defined HAVE_CUDA
// Image size in blocks.
const uint w = (image->width() + 3) / 4;
const uint h = (image->height() + 3) / 4;
uint imageSize = w * h * 16 * sizeof(Color32);
uint * blockLinearImage = (uint *) malloc(imageSize);
convertToBlockLinear(image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU!
const uint blockNum = w * h;
const uint compressedSize = blockNum * 8;
clock_t start = clock();
setupCompressKernel(compressionOptions.colorWeight.ptr());
// TODO: Add support for multiple GPUs.
uint bn = 0;
while(bn != blockNum)
{
uint count = min(blockNum - bn, MAX_BLOCKS);
cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressKernelCTX1(count, m_data, m_result, m_bitmapTable);
// Check for errors.
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
nvDebug("CUDA Error: %s\n", cudaGetErrorString(err));
if (outputOptions.errorHandler != NULL)
{
outputOptions.errorHandler->error(Error_CudaError);
}
}
// Copy result to host, overwrite swizzled image.
cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);
// Output result.
if (outputOptions.outputHandler != NULL)
{
outputOptions.outputHandler->writeData(blockLinearImage, count * 8);
}
bn += count;
}
clock_t end = clock();
printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
free(blockLinearImage);
#else
if (outputOptions.errorHandler != NULL)
{
outputOptions.errorHandler->error(Error_CudaError);
}
#endif
}
#if 0
class Task
{
public:
@ -583,7 +469,7 @@ public:
cudaMemcpy(d_blockLinearImage, blockLinearImage, blockCount * 64, cudaMemcpyHostToDevice);
// Launch kernel.
compressKernelDXT1(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
compressKernel(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
// Check for errors.
cudaError_t err = cudaGetLastError();
@ -625,6 +511,8 @@ private:
};
#endif // defined HAVE_CUDA
void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
{
@ -635,7 +523,9 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4);
const uint blockMax = 32768; // 49152, 65535
setupCompressKernelDXT1(compressionOptions.colorWeight.ptr());
doPrecomputation();
setupCompressKernel(compressionOptions.colorWeight.ptr());
ColorBlock rgba;
Task task(min(blockNum, blockMax));
@ -669,4 +559,4 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
#endif
}
#endif // 0

@ -31,24 +31,11 @@ namespace nv
{
class Image;
class CudaCompressor
{
public:
CudaCompressor();
~CudaCompressor();
void compressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void compressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void compressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void compressDXT1n(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void compressCTX1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
private:
uint * m_bitmapTable;
uint * m_data;
uint * m_result;
};
void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void cudaCompressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void cudaCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
void cudaCompressDXT1_2(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
} // nv namespace

@ -82,68 +82,6 @@ inline __device__ __host__ void operator /=(float3 & b, float f)
b.z *= inv;
}
inline __device__ __host__ bool operator ==(float3 a, float3 b)
{
return a.x == b.x && a.y == b.y && a.z == b.z;
}
// float2 operators
inline __device__ __host__ float2 operator *(float2 a, float2 b)
{
return make_float2(a.x*b.x, a.y*b.y);
}
inline __device__ __host__ float2 operator *(float f, float2 v)
{
return make_float2(v.x*f, v.y*f);
}
inline __device__ __host__ float2 operator *(float2 v, float f)
{
return make_float2(v.x*f, v.y*f);
}
inline __device__ __host__ float2 operator +(float2 a, float2 b)
{
return make_float2(a.x+b.x, a.y+b.y);
}
inline __device__ __host__ void operator +=(float2 & b, float2 a)
{
b.x += a.x;
b.y += a.y;
}
inline __device__ __host__ float2 operator -(float2 a, float2 b)
{
return make_float2(a.x-b.x, a.y-b.y);
}
inline __device__ __host__ void operator -=(float2 & b, float2 a)
{
b.x -= a.x;
b.y -= a.y;
}
inline __device__ __host__ float2 operator /(float2 v, float f)
{
float inv = 1.0f / f;
return v * inv;
}
inline __device__ __host__ void operator /=(float2 & b, float f)
{
float inv = 1.0f / f;
b.x *= inv;
b.y *= inv;
}
inline __device__ __host__ float dot(float2 a, float2 b)
{
return a.x * b.x + a.y * b.y;
}
inline __device__ __host__ float dot(float3 a, float3 b)
{
@ -193,37 +131,15 @@ inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
float m = max(max(x, y), z);
float iv = 1.0f / m;
#if __DEVICE_EMULATION__
if (m == 0.0f) iv = 0.0f;
#endif
v = make_float3(x*iv, y*iv, z*iv);
}
return v;
}
inline __device__ bool singleColor(const float3 * colors)
{
#if __DEVICE_EMULATION__
bool sameColor = false;
for (int i = 0; i < 16; i++)
{
sameColor &= (colors[idx] == colors[0]);
}
return sameColor;
#else
__shared__ int sameColor[16];
const int idx = threadIdx.x;
sameColor[idx] = (colors[idx] == colors[0]);
sameColor[idx] &= sameColor[idx^8];
sameColor[idx] &= sameColor[idx^4];
sameColor[idx] &= sameColor[idx^2];
sameColor[idx] &= sameColor[idx^1];
return sameColor[0];
#endif
}
inline __device__ void colorSums(const float3 * colors, float3 * sums)
{
#if __DEVICE_EMULATION__
@ -301,89 +217,5 @@ inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, fl
return firstEigenVector(covariance);
}
// @@ For 2D this may not be the most efficient method. It's a quadratic equation, right?
inline __device__ __host__ float2 firstEigenVector2D( float matrix[3] )
{
// @@ 8 iterations is probably more than enough.
float2 v = make_float2(1.0f, 1.0f);
for(int i = 0; i < 8; i++) {
float x = v.x * matrix[0] + v.y * matrix[1];
float y = v.x * matrix[1] + v.y * matrix[2];
float m = max(x, y);
float iv = 1.0f / m;
if (m == 0.0f) iv = 0.0f;
v = make_float2(x*iv, y*iv);
}
return v;
}
inline __device__ void colorSums(const float2 * colors, float2 * sums)
{
#if __DEVICE_EMULATION__
float2 color_sum = make_float2(0.0f, 0.0f, 0.0f);
for (int i = 0; i < 16; i++)
{
color_sum += colors[i];
}
for (int i = 0; i < 16; i++)
{
sums[i] = color_sum;
}
#else
const int idx = threadIdx.x;
sums[idx] = colors[idx];
sums[idx] += sums[idx^8];
sums[idx] += sums[idx^4];
sums[idx] += sums[idx^2];
sums[idx] += sums[idx^1];
#endif
}
inline __device__ float2 bestFitLine(const float2 * colors, float2 color_sum)
{
// Compute covariance matrix of the given colors.
#if __DEVICE_EMULATION__
float covariance[3] = {0, 0, 0};
for (int i = 0; i < 16; i++)
{
float2 a = (colors[i] - color_sum * (1.0f / 16.0f));
covariance[0] += a.x * a.x;
covariance[1] += a.x * a.y;
covariance[3] += a.y * a.y;
}
#else
const int idx = threadIdx.x;
float2 diff = (colors[idx] - color_sum * (1.0f / 16.0f));
__shared__ float covariance[16*3];
covariance[3 * idx + 0] = diff.x * diff.x;
covariance[3 * idx + 1] = diff.x * diff.y;
covariance[3 * idx + 2] = diff.y * diff.y;
for(int d = 8; d > 0; d >>= 1)
{
if (idx < d)
{
covariance[3 * idx + 0] += covariance[3 * (idx+d) + 0];
covariance[3 * idx + 1] += covariance[3 * (idx+d) + 1];
covariance[3 * idx + 2] += covariance[3 * (idx+d) + 2];
}
}
#endif
// Compute first eigen vector.
return firstEigenVector2D(covariance);
}
#endif // CUDAMATH_H

@ -75,9 +75,6 @@ namespace nvtt
Format_BC3n = Format_DXT5n,
Format_BC4, // ATI1
Format_BC5, // 3DC, ATI2
Format_DXT1n,
Format_CTX1,
};
/// Quality modes.

@ -1,221 +0,0 @@
// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
#include <nvtt/nvtt.h>
#include <stdio.h> // printf
#include <stdlib.h> // rand
#include <time.h> // clock
#include <string.h> // memcpy, memcmp
#include <assert.h>
#define FRAME_COUNT 1000
#define WIDTH 2048
#define HEIGHT 2048
#define INPUT_SIZE (WIDTH*HEIGHT)
#define OUTPUT_SIZE (WIDTH*HEIGHT/16*2)
static int s_input[INPUT_SIZE];
static int s_reference[OUTPUT_SIZE];
static int s_output[OUTPUT_SIZE];
static int s_frame = 0;
struct MyOutputHandler : public nvtt::OutputHandler
{
MyOutputHandler() : m_ptr(NULL) {}
virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel)
{
assert(size == sizeof(int) * OUTPUT_SIZE);
assert(width == WIDTH);
assert(height == HEIGHT);
assert(depth == 1);
assert(face == 0);
assert(miplevel == 0);
m_ptr = (unsigned char *)s_output;
if (s_frame == 1)
{
// Save first result as reference.
memcpy(s_reference, s_output, sizeof(int) * OUTPUT_SIZE);
}
else if (s_frame > 1)
{
// Compare against reference.
if (memcmp(s_output, s_reference, sizeof(int) * OUTPUT_SIZE) != 0)
{
printf("Compressed image different to original.\n");
exit(EXIT_FAILURE);
}
}
}
virtual bool writeData(const void * data, int size)
{
memcpy(m_ptr, data, size);
m_ptr += size;
return true;
}
unsigned char * m_ptr;
};
void precomp()
{
unsigned int bitmaps[1024];
int num = 0;
printf("{\n");
printf("\t%8X,\n", 0);
bitmaps[0] = 0;
num = 1;
for (int a = 1; a <= 15; a++)
{
for (int b = a; b <= 15; b++)
{
for (int c = b; c <= 15; c++)
{
int indices[16];
int i = 0;
for(; i < a; i++) {
indices[i] = 0;
}
for(; i < a+b; i++) {
indices[i] = 2;
}
for(; i < a+b+c; i++) {
indices[i] = 3;
}
for(; i < 16; i++) {
indices[i] = 1;
}
unsigned int bm = 0;
for(i = 0; i < 16; i++) {
bm |= indices[i] << (i * 2);
}
printf("\t0x%8X, // %d %d %d %d\n", bm, a-0, b-a, c-b, 16-c);
bitmaps[num] = bm;
num++;
}
}
}
printf("}\n");
printf("// num = %d\n", num);
/*
for( int i = imax; i >= 0; --i )
{
// second cluster [i,j) is one third along
for( int m = i; m < 16; ++m )
{
indices[m] = 2;
}
const int jmax = ( i == 0 ) ? 15 : 16;
for( int j = jmax; j >= i; --j )
{
// third cluster [j,k) is two thirds along
for( int m = j; m < 16; ++m )
{
indices[m] = 3;
}
int kmax = ( j == 0 ) ? 15 : 16;
for( int k = kmax; k >= j; --k )
{
// last cluster [k,n) is at the end
if( k < 16 )
{
indices[k] = 1;
}
uint bitmap = 0;
bool hasThree = false;
for(int p = 0; p < 16; p++) {
bitmap |= indices[p] << (p * 2);
}
bitmaps[num] = bitmap;
num++;
}
}
}
*/
}
int main(int argc, char *argv[])
{
//precomp();
nvtt::InputOptions inputOptions;
inputOptions.setTextureLayout(nvtt::TextureType_2D, WIDTH, HEIGHT);
for (int i = 0; i < INPUT_SIZE; i++)
{
s_input[i] = rand();
}
inputOptions.setMipmapData(s_input, WIDTH, HEIGHT);
inputOptions.setMipmapGeneration(false);
nvtt::CompressionOptions compressionOptions;
compressionOptions.setFormat(nvtt::Format_DXT1);
// compressionOptions.setFormat(nvtt::Format_DXT1n);
// compressionOptions.setFormat(nvtt::Format_CTX1);
nvtt::OutputOptions outputOptions;
outputOptions.setOutputHeader(false);
MyOutputHandler outputHandler;
outputOptions.setOutputHandler(&outputHandler);
nvtt::Compressor compressor;
for (s_frame = 0; s_frame < FRAME_COUNT; s_frame++)
{
clock_t start = clock();
printf("compressing frame %d:\n", s_frame);
compressor.process(inputOptions, compressionOptions, outputOptions);
clock_t end = clock();
printf("time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);
}
return EXIT_SUCCESS;
}

@ -129,15 +129,12 @@ struct NormalError
}
void done()
{
if (samples)
{
ade /= samples;
mse /= samples * 3;
rmse = sqrt(mse);
psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
}
}
void print()
{

Loading…
Cancel
Save