Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 71 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,77 @@
CUDA Stream Compaction
CIS 565 Project 2 - CUDA Stream Compaction
======================
* Richard Lee
* Tested on: Windows 7, i7-3720QM @ 2.60GHz 8GB, GT 650M 4GB (Personal Computer)

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
## Performance analysis
Performance testing was done on each implementation across a range of array sizes, averaged across 100 iterations each.
![](images/scanPerformance.png)
![](images/compactPerformance.png)
Overall, the CPU implementation for both the scan and stream compaction algorithms far outperformed their GPU counterparts. This was most likely due to the fact that they were able to deal with the input array and access memory much more efficiently than the GPU. In addition, I was only able to run the algorithms on inputs up to 2^16 in size, due to hardware restrictions - if run on even larger inputs, the GPU may have been able to take advantage of the parallel algorithms and gain a computational advantage over the CPU implementations.

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
One performance bottleneck I encountered was memory, as I was unable to allocate enough memory for an array of size greater than 2^16 on the GPU. I also found that the work-efficient scan was less performant than the naive scan, which could have been due to the fact that the number of threads allocated was not adjusted at runtime based on the level of up-sweep and down-sweep, which would be an additional feature to implement.

### (TODO: Your README)
## Test Output
```
****************
** SCAN TESTS **
****************
[ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 7 0 ]
==== cpu scan, power-of-two ====
[ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803684 803691 ]
==== cpu scan, non-power-of-two ====
[ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803630 803660 ]
passed
==== naive scan, power-of-two ====
[ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 803684 803691 ]
passed
==== naive scan, non-power-of-two ====
passed
==== work-efficient scan, power-of-two ====
passed
==== work-efficient scan, non-power-of-two ====
passed
==== thrust scan, power-of-two ====
passed
==== thrust scan, non-power-of-two ====
passed

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
****************************
** SCAN PERFORMANCE TESTS **
****************************
CPU POW SCAN TIME ELAPSED : 0.060004 milliseconds.
CPU NPOT SCAN TIME ELAPSED : 0.060004 milliseconds.
NAIVE POW SCAN TIME ELAPSED : 0.419879 milliseconds.
NAIVE NPOT SCAN TIME ELAPSED : 0.361572 milliseconds.
EFFICIENT POW SCAN TIME ELAPSED : 0.492805 milliseconds.
EFFICIENT NPOT SCAN TIME ELAPSED : 0.493135 milliseconds.
THRUST POW SCAN TIME ELAPSED : 1.0536 milliseconds.
THRUST NPOT SCAN TIME ELAPSED : 1.06989 milliseconds.

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 3 0 ]
==== cpu compact without scan, power-of-two ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 3 ]
passed
==== cpu compact without scan, non-power-of-two ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 2 ]
passed
==== cpu compact with scan ====
[ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 3 ]
passed
==== work-efficient compact, power-of-two ====
passed
==== work-efficient compact, non-power-of-two ====
passed

*****************************************
** STREAM COMPACTION PERFORMANCE TESTS **
*****************************************
CPU COMPACT NOSCAN POW TIME ELAPSED : 0.230013 milliseconds.
CPU COMPACT NOSCAN NPOT TIME ELAPSED : 0.230013 milliseconds.
CPU COMPACT SCAN TIME ELAPSED : 0.390022 milliseconds.
EFFICIENT POW COMPACT TIME ELAPSED : 0.530948 milliseconds.
EFFICIENT NPOT COMPACT TIME ELAPSED : 0.533724 milliseconds.
```
Binary file added images/compactPerformance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added images/scanPerformance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
130 changes: 128 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
#include <stream_compaction/efficient.h>
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"
#include <iostream>
#include <chrono>

int main(int argc, char* argv[]) {
const int SIZE = 1 << 8;
const int SIZE = 1 << 15;
const int NPOT = SIZE - 3;
int a[SIZE], b[SIZE], c[SIZE];

Expand Down Expand Up @@ -43,7 +45,7 @@ int main(int argc, char* argv[]) {
zeroArray(SIZE, c);
printDesc("naive scan, power-of-two");
StreamCompaction::Naive::scan(SIZE, c, a);
//printArray(SIZE, c, true);
printArray(SIZE, c, true);
printCmpResult(SIZE, b, c);

zeroArray(SIZE, c);
Expand Down Expand Up @@ -76,6 +78,79 @@ int main(int argc, char* argv[]) {
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);

printf("\n");
printf("****************************\n");
printf("** SCAN PERFORMANCE TESTS **\n");
printf("****************************\n");
uint32_t iterations = 100;
zeroArray(SIZE, c);
auto begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i)
{
StreamCompaction::CPU::scan(SIZE, c, a);
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
std::cout << "CPU POW SCAN TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl;

zeroArray(SIZE, c);
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i)
{
StreamCompaction::CPU::scan(NPOT, c, a);
}
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
std::cout << "CPU NPOT SCAN TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl;

zeroArray(SIZE, c);
float timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
timer += StreamCompaction::Naive::scan(SIZE, c, a);
}
std::cout << "NAIVE POW SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;

zeroArray(SIZE, c);
timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
timer += StreamCompaction::Naive::scan(NPOT, c, a);
}
std::cout << "NAIVE NPOT SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;

zeroArray(SIZE, c);
timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
timer += StreamCompaction::Efficient::scan(SIZE, c, a);
}
std::cout << "EFFICIENT POW SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;

zeroArray(SIZE, c);
timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
timer += StreamCompaction::Efficient::scan(NPOT, c, a);
}
std::cout << "EFFICIENT NPOT SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;

zeroArray(SIZE, c);
timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
timer += StreamCompaction::Thrust::scan(SIZE, c, a);
}
std::cout << "THRUST POW SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;

zeroArray(SIZE, c);
timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
timer += StreamCompaction::Thrust::scan(NPOT, c, a);
}
std::cout << "THRUST NPOT SCAN TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;

printf("\n");
printf("*****************************\n");
printf("** STREAM COMPACTION TESTS **\n");
Expand Down Expand Up @@ -120,4 +195,55 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

printf("\n");
printf("*****************************************\n");
printf("** STREAM COMPACTION PERFORMANCE TESTS **\n");
printf("*****************************************\n");

zeroArray(SIZE, c);
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i)
{
StreamCompaction::CPU::compactWithoutScan(SIZE, c, a);
}
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
std::cout << "CPU COMPACT NOSCAN POW TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl;

zeroArray(SIZE, c);
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i)
{
StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
}
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
std::cout << "CPU COMPACT NOSCAN NPOT TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl;

zeroArray(SIZE, c);
begin = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i)
{
StreamCompaction::CPU::compactWithScan(SIZE, c, a);
}
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count();
std::cout << "CPU COMPACT SCAN TIME ELAPSED : " << (float)(duration / iterations) * 0.000001 << " milliseconds." << std::endl;

zeroArray(SIZE, c);
timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
StreamCompaction::Efficient::compact(SIZE, c, a, &timer);
}
std::cout << "EFFICIENT POW COMPACT TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;

zeroArray(SIZE, c);
timer = 0.0f;
for (int i = 0; i < iterations; ++i)
{
StreamCompaction::Efficient::compact(NPOT, c, a, &timer);
}
std::cout << "EFFICIENT NPOT COMPACT TIME ELAPSED : " << timer / iterations << " milliseconds." << std::endl;
}
12 changes: 10 additions & 2 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@ namespace Common {
* which map to 0 will be removed, and elements which map to 1 will be kept.
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < n) {
bools[index] = (idata[index] != 0);
}
}

/**
Expand All @@ -32,7 +35,12 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
*/
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < n) {
if (bools[index]) {
odata[indices[index]] = idata[index];
}
}
}

}
Expand Down
40 changes: 34 additions & 6 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <cstdio>
#include <vector>
#include "cpu.h"

namespace StreamCompaction {
Expand All @@ -8,8 +9,11 @@ namespace CPU {
* CPU scan (prefix sum).
*/
void scan(int n, int *odata, const int *idata) {
// TODO
printf("TODO\n");
int sum = 0;
for (int i = 0; i < n; i++) {
odata[i] = sum;
sum += idata[i];
}
}

/**
Expand All @@ -18,8 +22,14 @@ void scan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
// TODO
return -1;
int j = 0;
for (int i = 0; i < n; i++) {
if (idata[i] != 0) {
odata[j] = idata[i];
j++;
}
}
return j;
}

/**
Expand All @@ -28,8 +38,26 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
// TODO
return -1;
// Map elements to boolean array
std::vector<int> bools(n);
for (int i = 0; i < n; i++) {
bools[i] = (idata[i] != 0);
}

// Perform exclusive scan on temp array
std::vector<int> indices(n);
scan(n, indices.data(), bools.data());

// Scatter
int elementCount;
for (int i = 0; i < n; i++) {
if (bools[i]) {
odata[indices[i]] = idata[i];
elementCount = indices[i] + 1;
}
}

return elementCount;
}

}
Expand Down
Loading