Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 2: Liang Peng #19

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
34 changes: 24 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
CUDA Stream Compaction
======================
# University of Pennsylvania, CIS 565: GPU Programming and Architecture
## Project 2 - Stream Compaction
* Liang Peng
* Tested on: Windows 10, i7-6700HQ @ 2.6GHz, 8GB, GTX 960M (Personal Computer)

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
## Screenshots
* Result
<br><img src="img/Capture1.PNG" width="500"></img>

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)

### (TODO: Your README)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
## Analysis
* Time measurement with std::chrono
<blockquote>
high_resolution_clock::time_point t1 = high_resolution_clock::now();<br>
kernel<<<..., ...>>>(...);<br>
cudaDeviceSynchronize(); // kernel launches are asynchronous<br>
high_resolution_clock::time_point t2 = high_resolution_clock::now();<br>
duration t = t2 - t1;<br>
print t.count();<br>
</blockquote>

* Array size
<br><img src="img/Capture2.PNG" width="500"></img>
* _Observation_ The CPU implementation is always faster than the GPU implementation; the reason might be that there is considerable overhead in the GPU implementation. My speculation is that, as the scan proceeds to the next level, the number of idling threads increases. This could be optimized by removing idle threads before launching the kernel for the next level; with that approach, the indices threads use to access array elements must be recalculated properly.

* Block size
<br><img src="img/Capture3.PNG" width="500"></img>
* _Observation_ As block size increases, the time consumed to perform the scan decreases and at some point stabilizes. The reason is the same as analyzed in the last project.
Binary file added img/Capture1.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/Capture2.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/Capture3.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 12 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@
#include "testing_helpers.hpp"

int main(int argc, char* argv[]) {
const int SIZE = 1 << 8;
const int SIZE = 1 << 23;
const int NPOT = SIZE - 3;
int a[SIZE], b[SIZE], c[SIZE];
int *a, *b, *c;

// allocate memory for test data
a = new int [SIZE];
b = new int [SIZE];
c = new int [SIZE];

// Scan tests

Expand Down Expand Up @@ -120,4 +125,9 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

// free memory
delete [] a;
delete [] b;
delete [] c;
}
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_50
)
14 changes: 14 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ namespace Common {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // Flag each element for compaction: 1 if it is kept (non-zero), else 0.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // guard the grid tail; grids rarely divide n evenly
    if (tid < n) {
        bools[tid] = (idata[tid] != 0) ? 1 : 0;
    }
}

/**
Expand All @@ -33,6 +40,13 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // Scatter step of stream compaction: every kept element (bools[tid] != 0)
    // is written to the destination slot given by the scanned flags.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // guard the grid tail and skip discarded elements
    if (tid < n && bools[tid] != 0) {
        odata[indices[tid]] = idata[tid];
    }
}

}
Expand Down
1 change: 1 addition & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ inline int ilog2ceil(int x) {


namespace StreamCompaction {
const int BLOCK_SIZE = 256;
namespace Common {
__global__ void kernMapToBoolean(int n, int *bools, const int *idata);

Expand Down
43 changes: 40 additions & 3 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#include <cstdio>
#include <chrono>
using namespace std::chrono;
#include "cpu.h"
#include <iostream>
using std::cout;
using std::endl;

namespace StreamCompaction {
namespace CPU {
Expand All @@ -9,7 +14,15 @@ namespace CPU {
*/
void scan(int n, int *odata, const int *idata) {
// TODO
printf("TODO\n");
//printf("TODO\n");
high_resolution_clock::time_point t1 = high_resolution_clock::now();
odata[0] = 0;
for (int i = 1; i < n; ++i) {
odata[i] = odata[i - 1] + idata[i - 1];
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<int, std::milli> t12 = duration_cast<duration<int, std::milli>>(t2 - t1);
cout << "----------Time consumed: " << t12.count() << " ms----------" << endl;
}

/**
Expand All @@ -19,7 +32,14 @@ void scan(int n, int *odata, const int *idata) {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    // CPU stream compaction without a scan: copy every non-zero element of
    // idata into odata, preserving order, and return how many were kept.
    // (fix) removed the stray early `return -1;` that made the loop unreachable.
    int m = 0;

    for (int i = 0; i < n; ++i) {
        if (idata[i] == 0) continue;   // drop zeros
        odata[m++] = idata[i];
    }

    return m;   // number of elements written to odata
}

/**
Expand All @@ -29,7 +49,24 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
*/
int compactWithScan(int n, int *odata, const int *idata) {
    // CPU stream compaction via map / exclusive-scan / scatter.
    // Returns the number of non-zero elements copied (in order) into odata.
    // (fix) removed the stray early `return -1;`, and corrected the count:
    // the last entry of an *exclusive* scan omits the final element's flag,
    // so the total kept is scan[n-1] + flag[n-1], not scan[n-1] alone.
    if (n <= 0) {
        return 0;
    }

    int *nonZero = new int [n];

    // map: 1 for elements to keep, 0 for zeros
    for (int i = 0; i < n; ++i) {
        nonZero[i] = idata[i] == 0 ? 0 : 1;
    }

    // exclusive scan of the flags into odata (inlined plain loop so this
    // block is self-contained and does not print timing a second time)
    odata[0] = 0;
    for (int i = 1; i < n; ++i) {
        odata[i] = odata[i - 1] + nonZero[i - 1];
    }

    // total kept = last scan value + last flag
    int m = odata[n - 1] + nonZero[n - 1];

    // scatter in place: the destination odata[i] (a scan value) is always
    // <= i, so a write never clobbers a scan value that is read later
    for (int i = 0; i < n; ++i) {
        if (nonZero[i] == 0) continue;
        odata[odata[i]] = idata[i];
    }

    delete [] nonZero;

    return m;
}

}
Expand Down
142 changes: 140 additions & 2 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
@@ -1,19 +1,106 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
using namespace std::chrono;
#include <iostream>
using std::cout;
using std::endl;
#include "common.h"
#include "efficient.h"

namespace StreamCompaction {
namespace Efficient {

// TODO: __global__
// One pass of the Blelloch up-sweep (reduce) phase, applied in place for a
// single stride `step` (a power of two chosen by the host loop in scanOnGPU).
// NOTE(review): partner selection is mirrored from the array tail (rIndex);
// for a power-of-two n this matches the standard up-sweep pairing -- confirm
// callers only pass power-of-two sizes.
__global__ void upSweep(const int n, const int step, int *data) {

int index = blockIdx.x * blockDim.x + threadIdx.x;

// guard the grid tail
if (index >= n) {
return;
}

// distance of this element from the end of the array
int rIndex = n - 1 - index;
int mask = 1;

// builds mask = 2 * step - 1 by doubling i until it reaches step
for (int i = 1; i != step; (i <<= 1), (mask = mask << 1 | 1));

// active threads (mirrored index a multiple of 2*step) accumulate the
// partner element `step` positions earlier
if (index - step >= 0 && (rIndex & mask) == 0) {
data[index] = data[index] + data[index - step];
}
}

// One pass of the Blelloch down-sweep phase, applied in place for a single
// stride `step`: each active pair swaps and accumulates, converting the
// reduced tree left by upSweep into an exclusive scan.
// NOTE(review): same mirrored-from-the-tail indexing as upSweep; appears to
// assume a power-of-two n -- confirm with callers.
__global__ void downSweep(const int n, const int step, int *data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;

// guard the grid tail
if (index >= n) {
return;
}

// distance of this element from the end of the array
int rIndex = n - 1 - index;
int mask = 1;

// builds mask = 2 * step - 1 by doubling i until it reaches step
for (int i = 1; i != step; (i <<= 1), (mask = mask << 1 | 1));

if (index - step >= 0 && (rIndex & mask) == 0) {
// swap-and-add: the earlier element receives this element's old value,
// while this element accumulates both
auto tmp = data[index];
data[index] += data[index - step];
data[index - step] = tmp;
}
}

// Runs a complete work-efficient (Blelloch) exclusive scan in place on the
// device buffer dev_data. Host-side driver: launches one kernel per level.
// NOTE(review): the kernels' mask logic appears to assume n is a power of
// two -- confirm callers pad the input accordingly.
void scanOnGPU(const int n, int *dev_data) {
dim3 blockCount = (n - 1) / BLOCK_SIZE + 1;   // ceil(n / BLOCK_SIZE)
int step;

// up-sweep: build partial sums, doubling the stride each level
for (step = 1; step < n; step <<= 1) {
upSweep<<<blockCount, BLOCK_SIZE>>>(n, step, dev_data);
}

// set last element to 0 -- the identity that seeds the exclusive scan
cudaMemset(&dev_data[n - 1], 0, sizeof(int));

// down-sweep: push partial sums back down, halving the stride each level
for (step >>= 1; step > 0; step >>= 1) {
downSweep<<<blockCount, BLOCK_SIZE>>>(n, step, dev_data);
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
    // GPU work-efficient exclusive prefix-sum: copies idata to the device,
    // scans in place via scanOnGPU, and copies the result back into odata.
    // Prints the elapsed scan time (transfers excluded) to stdout.
    // (fix) removed the leftover debug `printf("TODO\n");` and added error
    // checks after each CUDA call, matching the file's checkCUDAError style.
    int *dev_data;

    // device memory allocation
    cudaMalloc((void**)&dev_data, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_data");

    // copy input data to device
    cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n,
            cudaMemcpyHostToDevice);
    checkCUDAError("Failed to copy idata to device");

    // tic -- time only the scan itself, not the transfers
    std::chrono::high_resolution_clock::time_point t1 =
            std::chrono::high_resolution_clock::now();

    // do scan (in place on the device buffer)
    scanOnGPU(n, dev_data);

    // toc -- launches are asynchronous, so synchronize before reading the
    // clock; this also surfaces any kernel execution error
    cudaDeviceSynchronize();
    checkCUDAError("Kernel execution failed in scanOnGPU");
    std::chrono::high_resolution_clock::time_point t2 =
            std::chrono::high_resolution_clock::now();
    std::chrono::duration<int, std::milli> t12 =
            std::chrono::duration_cast<std::chrono::duration<int, std::milli> >(t2 - t1);
    std::cout << "----------Time consumed: " << t12.count() << " ms----------" << std::endl;

    // copy result to host
    cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n,
            cudaMemcpyDeviceToHost);
    checkCUDAError("Failed to copy result to host");

    // free memory on device
    cudaFree(dev_data);
}

/**
Expand All @@ -27,7 +114,58 @@ void scan(int n, int *odata, const int *idata) {
*/
int compact(int n, int *odata, const int *idata) {
    // GPU stream compaction: map to 0/1 flags, exclusive-scan the flags to
    // get scatter destinations, then scatter kept elements into odata.
    // Returns the number of non-zero elements kept.
    // (fix) the count was read from dev_boolScan[n-1] alone; an *exclusive*
    // scan's last entry omits the final element's flag, so the result was
    // short by one whenever idata[n-1] != 0. The count is now
    // scan[n-1] + flag[n-1]. Also removed the stray early `return -1;`.
    int count;
    int lastBool;
    int *dev_data;
    int *dev_dataCopy;
    int *dev_bool;
    int *dev_boolScan;

    // device memory allocation
    cudaMalloc((void**)&dev_data, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_data");

    cudaMalloc((void**)&dev_dataCopy, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_dataCopy");

    cudaMalloc((void**)&dev_bool, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_bool");

    cudaMalloc((void**)&dev_boolScan, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_boolScan");

    // copy input data to device
    cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n,
            cudaMemcpyHostToDevice);
    checkCUDAError("Failed to copy idata to device");

    dim3 blockCount = (n - 1) / BLOCK_SIZE + 1;   // ceil(n / BLOCK_SIZE)

    // map to booleans: 1 = keep, 0 = discard
    Common::kernMapToBoolean<<<blockCount, BLOCK_SIZE>>>(n, dev_bool, dev_data);

    // exclusive-scan the booleans to obtain scatter destinations
    cudaMemcpy((void*)dev_boolScan, (const void*)dev_bool, sizeof(int) * n,
            cudaMemcpyDeviceToDevice);
    scanOnGPU(n, dev_boolScan);

    // scatter from an untouched copy so reads and writes do not alias
    cudaMemcpy((void*)dev_dataCopy, (const void*)dev_data, sizeof(int) * n,
            cudaMemcpyDeviceToDevice);
    Common::kernScatter<<<blockCount, BLOCK_SIZE>>>(n, dev_data, dev_dataCopy,
            dev_bool, dev_boolScan);

    // copy result to host
    cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n,
            cudaMemcpyDeviceToHost);
    cudaMemcpy((void*)&count, (const void*)&dev_boolScan[n - 1], sizeof(int),
            cudaMemcpyDeviceToHost);
    cudaMemcpy((void*)&lastBool, (const void*)&dev_bool[n - 1], sizeof(int),
            cudaMemcpyDeviceToHost);
    checkCUDAError("Failed to copy results to host");
    count += lastBool;   // include the final element's flag (see fix note)

    // free memory on device
    cudaFree(dev_data);
    cudaFree(dev_dataCopy);
    cudaFree(dev_bool);
    cudaFree(dev_boolScan);

    return count;
}

}
Expand Down
Loading