diff --git a/project/vc9/nvtt.sln b/project/vc9/nvtt.sln
index 02df0c7..015fd8d 100644
--- a/project/vc9/nvtt.sln
+++ b/project/vc9/nvtt.sln
@@ -93,6 +93,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "nvthread", "nvthread\nvthre
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cubemaptest", "cubemaptest\cubemaptest.vcproj", "{CFB3FEAC-5720-4B16-9D7E-039DB180B641}"
 	ProjectSection(ProjectDependencies) = postProject
+		{F143D180-D4C4-4037-B3DE-BE89A21C8D1D} = {F143D180-D4C4-4037-B3DE-BE89A21C8D1D}
 		{1AEB7681-57D8-48EE-813D-5C41CC38B647} = {1AEB7681-57D8-48EE-813D-5C41CC38B647}
 	EndProjectSection
 EndProject
diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h
index fbcc9c6..dd7fd5b 100644
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@@ -7,8 +7,6 @@
 #include "nvcore.h"
 #include "Debug.h" // nvDebugCheck
 
-#include <stddef.h>
-
 // Just in case. Grrr.
 #undef min
 #undef max
diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h
index 672b79f..d618be0 100644
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@@ -221,7 +221,7 @@ namespace nv
     }
 
     /// Get scanline pointer.
-    inline float * FloatImage::scanline(uint z, uint y, uint c)
+    inline float * FloatImage::scanline(uint c, uint y, uint z)
     {
         nvDebugCheck(y < m_height);
         return plane(c, z) + y * m_width;
diff --git a/src/nvtt/CubeSurface.cpp b/src/nvtt/CubeSurface.cpp
index 5827a50..8acfef1 100644
--- a/src/nvtt/CubeSurface.cpp
+++ b/src/nvtt/CubeSurface.cpp
@@ -333,7 +333,7 @@ const Vector3 & VectorTable::lookup(uint f, uint x, uint y) const {
 // - parallelize.
 // - use ISPC?
 
-static Vector3 faceNormals[6] = {
+static const Vector3 faceNormals[6] = {
     Vector3(1, 0, 0),
     Vector3(-1, 0, 0),
     Vector3(0, 1, 0),
@@ -342,11 +342,31 @@ static Vector3 faceNormals[6] = {
     Vector3(0, 0, -1),
 };
 
+static const Vector3 faceU[6] = { // Per-face direction of the u axis; read as faceU[f] next to faceNormals[f]. Only the disabled (if (false)) bounding-box code uses it.
+    Vector3(0, 0, -1),
+    Vector3(0, 0, 1),
+    Vector3(1, 0, 0),
+    Vector3(1, 0, 0),
+    Vector3(1, 0, 0),
+    Vector3(-1, 0, 0),
+};
+
+static const Vector3 faceV[6] = { // Per-face direction of the v axis; read as faceV[f] next to faceNormals[f]. Only the disabled (if (false)) bounding-box code uses it.
+    Vector3(0, -1, 0),
+    Vector3(0, -1, 0),
+    Vector3(0, 0, 1),
+    Vector3(0, 0, -1),
+    Vector3(0, -1, 0),
+    Vector3(0, -1, 0),
+};
+
+
 
 // Convolve filter against this cube.
-Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir, float cosineConeAngle, float cosinePower)
+Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir, float coneAngle, float cosinePower)
 {
-    const float coneAngle = acos(cosineConeAngle);
+    const float cosineConeAngle = cos(coneAngle);
+    nvDebugCheck(cosineConeAngle >= 0);
 
     Vector3 color(0);
     float sum = 0;
@@ -356,25 +376,74 @@ Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir,
 
         // Test face cone agains filter cone.
         float cosineFaceAngle = dot(filterDir, faceNormals[f]);
+        float faceAngle = acosf(cosineFaceAngle);
 
-        if (cosineFaceAngle > cos(coneAngle + atan(sqrt(2)))) { // @@ Simplify this with cos(a+b) = cos(a)cos(b) - sin(a)sin(b) formula?
+        /*if (faceAngle > coneAngle + atanf(sqrtf(2))) {
             // Skip face.
             continue;
-        }
+        }*/
 
         // @@ We could do a less conservative test and test the face frustum against the cone...
 
-        // @@ Compute bounding box of cone intersection against face.
+        // Compute bounding box of cone intersection against face.
         // The intersection of the cone with the face is an elipse, we want the extents of that elipse.
-        // Hmm... we could even rasterize an elipse! Sounds like FUN!
-        uint x0 = 0, x1 = edgeLength-1;
-        uint y0 = 0, y1 = edgeLength-1;
+        // @@ Hmm... we could even rasterize an ellipse! Sounds like FUN!
+
+        const int L = toI32(edgeLength-1);
+        int x0 = 0, x1 = L;
+        int y0 = 0, y1 = L;
+
+        // @@ Ugh. This is wrong, or only right when filterDir is aligned to one axis.
+        if (false) {
+            // uv coordinates corresponding to filterDir.
+            //float u = dot(filterDir, faceU[f]) / cosineFaceAngle;
+            //float v = dot(filterDir, faceV[f]) / cosineFaceAngle;
+
+            // Angular coordinates corresponding to filterDir with respect to faceNormal.
+            float atu = atan2(dot(filterDir, faceU[f]), cosineFaceAngle);
+            float atv = atan2(dot(filterDir, faceV[f]), cosineFaceAngle);
+
+            // Expand angles and project back to the face plane.
+            float u0 = tan(clamp(atu - coneAngle, -PI/4, PI/4));
+            float v0 = tan(clamp(atv - coneAngle, -PI/4, PI/4));
+            float u1 = tan(clamp(atu + coneAngle, -PI/4, PI/4));
+            float v1 = tan(clamp(atv + coneAngle, -PI/4, PI/4));
+            nvDebugCheck(u0 >= -1 && u0 <= 1);
+            nvDebugCheck(v0 >= -1 && v0 <= 1);
+            nvDebugCheck(u1 >= -1 && u1 <= 1);
+            nvDebugCheck(v1 >= -1 && v1 <= 1);
+
+            // Expand uv coordinates from [-1,1] to [0, edgeLength)
+            u0 = (u0 + 1) * edgeLength * 0.5f - 0.5f;
+            v0 = (v0 + 1) * edgeLength * 0.5f - 0.5f;
+            u1 = (u1 + 1) * edgeLength * 0.5f - 0.5f;
+            v1 = (v1 + 1) * edgeLength * 0.5f - 0.5f;
+            nvDebugCheck(u0 >= -0.5f && u0 <= edgeLength - 0.5f);
+            nvDebugCheck(v0 >= -0.5f && v0 <= edgeLength - 0.5f);
+            nvDebugCheck(u1 >= -0.5f && u1 <= edgeLength - 0.5f);
+            nvDebugCheck(v1 >= -0.5f && v1 <= edgeLength - 0.5f);
+
+            x0 = clamp(ifloor(u0), 0, L);
+            y0 = clamp(ifloor(v0), 0, L);
+            x1 = clamp(iceil(u1), 0, L);
+            y1 = clamp(iceil(v1), 0, L);
+
+            nvDebugCheck(x1 >= x0);
+            nvDebugCheck(y1 >= y0);
+        }
+
+        if (x1 < x0 || y1 < y0) {
+            // Skip this face only when the inclusive texel range [x0,x1] x [y0,y1] is empty; x1 == x0 is still one valid column (e.g. edgeLength == 1, where L == 0).
+            continue;
+        }
+
 
         const Surface & inputFace = face[f];
         const FloatImage * inputImage = inputFace.m->image;
 
-        for (uint y = y0; y <= y1; y++) {
-            for (uint x = x0; x <= x1; x++) {
+        for (int y = y0; y <= y1; y++) {
+            bool inside = false;
+            for (int x = x0; x <= x1; x++) {
 
                 Vector3 dir = vectorTable->lookup(f, x, y);
                 float cosineAngle = dot(dir, filterDir);
@@ -388,6 +457,13 @@ Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir,
                     color.x += contribution * inputImage->pixel(0, x, y, 0);
                     color.y += contribution * inputImage->pixel(1, x, y, 0);
                     color.z += contribution * inputImage->pixel(2, x, y, 0);
+
+                    inside = true;
+                }
+                else if (inside) {
+                    // Filter scale is monotonic, if we have been inside once and we just exit, then we can skip the rest of the row.
+                    // We could do the same thing for the columns and skip entire rows.
+                    break;
                 }
             }
         }
@@ -398,6 +474,39 @@ Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir,
     return color;
 }
 
+#include "nvthread/ParallelFor.h"
+
+struct ApplyCosinePowerFilterContext { // Arguments shared by all ApplyCosinePowerFilterTask invocations; passed to the task as its void* context.
+    CubeSurface::Private * inputCube; // Cube that is sampled: the task calls applyCosinePowerFilter on it.
+    CubeSurface::Private * filteredCube; // Cube whose face images receive the filtered texels; its edgeLength defines the task index space.
+    float coneAngle; // Forwarded to applyCosinePowerFilter, which takes cos() of it (an angle in radians).
+    float cosinePower; // Forwarded to applyCosinePowerFilter unchanged.
+};
+
+void ApplyCosinePowerFilterTask(void * context, int id) // Task callback given to nv::ParallelFor: filters a single texel of the output cube, selected by the flat index id.
+{
+    ApplyCosinePowerFilterContext * ctx = (ApplyCosinePowerFilterContext *)context; // context is the ApplyCosinePowerFilterContext registered with ParallelFor.
+
+    int size = ctx->filteredCube->edgeLength;
+
+    int f = id / (size * size); // Face index: id is decoded as f * size * size + y * size + x.
+    int idx = id % (size * size); // Flat texel index within face f.
+    int y = idx / size;
+    int x = idx % size;
+
+    nvtt::Surface & filteredFace = ctx->filteredCube->face[f];
+    FloatImage * filteredImage = filteredFace.m->image;
+
+    const Vector3 filterDir = texelDirection(f, x, y, 1.0f / size); // Direction passed to the filter for texel (x, y) of face f.
+
+    // Convolve filter against cube.
+    Vector3 color = ctx->inputCube->applyCosinePowerFilter(filterDir, ctx->coneAngle, ctx->cosinePower);
+
+    filteredImage->pixel(0, idx) = color.x; // Channels 0, 1, 2 receive color.x, color.y, color.z; each task writes only its own idx of face f.
+    filteredImage->pixel(1, idx) = color.y;
+    filteredImage->pixel(2, idx) = color.z;
+}
+
 
 CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower) const
 {
@@ -415,12 +524,47 @@ CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower) const
         m->vectorTable = new VectorTable(edgeLength);
     }
 
-    const float threshold = 0.0001f;
-    const float cosineConeAngle = pow(threshold, 1/cosinePower);
-    //const float coneAngle = acos(cosineConeAngle);
+    const float threshold = 0.001f;
+    const float coneAngle = acosf(powf(threshold, 1.0f/cosinePower));
+
+
+#if 1
+    // Gather approach. This should be easier to parallelize, because there's no contention in the filtered output.
+
+    // For each texel of the output cube.
+    // - Determine what texels of the input cube contribute to it.
+    // - Add weighted contributions. Normalize.
+
+    // For each texel of the output cube.
+    /*for (uint f = 0; f < 6; f++) {
+        nvtt::Surface filteredFace = filteredCube.m->face[f];
+        FloatImage * filteredImage = filteredFace.m->image;
+
+        for (uint y = 0; y < uint(size); y++) {
+            for (uint x = 0; x < uint(size); x++) {
+
+                const Vector3 filterDir = texelDirection(f, x, y, 1.0f / size);
+
+                // Convolve filter against cube.
+                Vector3 color = m->applyCosinePowerFilter(filterDir, coneAngle, cosinePower);
+
+                filteredImage->pixel(0, x, y, 0) = color.x;
+                filteredImage->pixel(1, x, y, 0) = color.y;
+                filteredImage->pixel(2, x, y, 0) = color.z;
+            }
+        }
+    }*/
+
+    ApplyCosinePowerFilterContext context;
+    context.inputCube = m;
+    context.filteredCube = filteredCube.m;
+    context.coneAngle = coneAngle;
+    context.cosinePower = cosinePower;
 
+    nv::ParallelFor parallelFor(ApplyCosinePowerFilterTask, &context);
+    parallelFor.run(6 * size * size);
 
-#if 0
+#else
     // Scatter approach.
 
     // For each texel of the input cube.
@@ -480,54 +624,6 @@ CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower) const
         }
     }
 
-#else
-
-    // Gather approach. This should be easier to parallelize, because there's no contention in the filtered output.
-
-    // For each texel of the output cube.
-    // - Determine what texels of the input cube contribute to it.
-    // - Add weighted contributions. Normalize.
-
-    // For each texel of the output cube. @@ Parallelize this loop.
-    for (uint f = 0; f < 6; f++) {
-        nvtt::Surface filteredFace = filteredCube.m->face[f];
-        FloatImage * filteredImage = filteredFace.m->image;
-
-        for (uint y = 0; y < uint(size); y++) {
-            for (uint x = 0; x < uint(size); x++) {
-
-                const Vector3 filterDir = texelDirection(f, x, y, 1.0f / size);
-
-                // Convolve filter against cube.
-                Vector3 color = m->applyCosinePowerFilter(filterDir, cosineConeAngle, cosinePower);
-
-                filteredImage->pixel(0, x, y, 0) = color.x;
-                filteredImage->pixel(1, x, y, 0) = color.y;
-                filteredImage->pixel(2, x, y, 0) = color.z;
-            }
-        }
-    }
-
-    /*int jobCount = 6 * size * size;
-    for (int i = 0; i < jobCount; i++) {
-        int f = i / (size * size);
-        int idx = i % (size * size);
-        int y = idx / size;
-        int x = idx % size;
-
-        nvtt::Surface filteredFace = filteredCube.m->face[f];
-        FloatImage * filteredImage = filteredFace.m->image;
-
-        const Vector3 filterDir = texelDirection(f, x, y, 1.0f / size);
-
-        // Convolve filter against cube.
-        Vector3 color = m->applyCosinePowerFilter(filterDir, coneAngle, cosinePower);
-
-        filteredImage->pixel(0, idx) = color.x;
-        filteredImage->pixel(1, idx) = color.y;
-        filteredImage->pixel(2, idx) = color.z;
-    }*/
-
 #endif
 
     return filteredCube;
diff --git a/src/nvtt/tests/cubemaptest.cpp b/src/nvtt/tests/cubemaptest.cpp
index be27fe5..8ad16bd 100644
--- a/src/nvtt/tests/cubemaptest.cpp
+++ b/src/nvtt/tests/cubemaptest.cpp
@@ -21,8 +21,12 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 
-#include <nvcore/StrLib.h>
+//#include <nvcore/StrLib.h>
+#include <nvcore/Timer.h>
 #include <nvtt/nvtt.h>
+#include <nvmath/nvmath.h>
+
+#include "../tools/cmdline.h"
 
 #include <stdlib.h> // EXIT_SUCCESS, EXIT_FAILURE
 #include <stdio.h> // printf
@@ -30,6 +34,9 @@
 
 int main(int argc, char *argv[])
 {
+    MyAssertHandler assertHandler;
+    MyMessageHandler messageHandler;
+
     // Init context.
     nvtt::Context context;
 
@@ -52,26 +59,43 @@ int main(int argc, char *argv[])
     // Setup output options.
     nvtt::OutputOptions outputOptions;
     outputOptions.setFileName("filtered_envmap.dds");
+    outputOptions.setSrgbFlag(true);
 
 
+    const int MAX_MIPMAP_COUNT = 7; // nv::log2(64) + 1;
+    //const int mipmapCount = MAX_MIPMAP_COUNT;
+    const int mipmapCount = 4;
+    //const int mipmapCount = 1;
+
     // Output header.
-    context.outputHeader(nvtt::TextureType_Cube, 64, 64, 1, 4, false, compressionOptions, outputOptions);
+    context.outputHeader(nvtt::TextureType_Cube, 64, 64, 1, mipmapCount, false, compressionOptions, outputOptions);
 
-    // Output filtered mipmaps.
-    for (int m = 0; m < 4; m++) {
-        int size = 64 / (1 << m);               // 64, 32, 16, 8
-        float cosine_power = float(64) / (1 << (2 * m)); // 64, 16,  4, 1
+    nv::Timer timer;
+    timer.start();
 
-        printf("filtering step: %d/4.\n", m+1);
+    nvtt::CubeSurface filteredEnvmap[mipmapCount];
 
-        nvtt::CubeSurface filteredEnvmap = envmap.cosinePowerFilter(size, cosine_power);
+    // Output filtered mipmaps.
+    for (int m = 0; m < mipmapCount; m++) {
+        int size = 64 / (1 << m);                           // 64, 32, 16, 8
+        float cosine_power = float(64) / (1 << (2 * m));    // 64, 16,  4, 1
+        cosine_power = nv::max(1.0f, cosine_power);
+
+        printf("filtering step: %d/%d\n", m+1, mipmapCount);
 
-        filteredEnvmap.toGamma(2.2f);
+        filteredEnvmap[m] = envmap.cosinePowerFilter(size, cosine_power);
+        filteredEnvmap[m].toGamma(2.2f);
+    }
 
-        context.compress(filteredEnvmap, m, compressionOptions, outputOptions);
+    for (int f = 0; f < 6; f++) {
+        for (int m = 0; m < mipmapCount; m++) {
+            context.compress(filteredEnvmap[m].face(f), f, m, compressionOptions, outputOptions);
+        }
     }
 
-    printf("done.\n");
+    timer.stop();
+
+    printf("done in %f seconds\n", timer.elapsed());
 
     return EXIT_SUCCESS;
 }