Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 2: Liang Peng #19

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
34 changes: 24 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
CUDA Stream Compaction
======================
# University of Pennsylvania, CIS 565: GPU Programming and Architecture
## Project 2 - Stream Compaction
* Liang Peng
* Tested on: Windows 10, i7-6700HQ @ 2.6GHz, 8GB, GTX 960M (Personal Computer)

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
## Screenshots
* Result
<br><img src="img/Capture1.PNG" width="500"></img>

* (TODO) YOUR NAME HERE
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)

### (TODO: Your README)

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)
## Analysis
* Time measurement with std::chrono
<blockquote>
high_resolution_clock::time_point t1 = high_resolution_clock::now();<br>
kernel<<<..., ...>>>(...);<br>
cudaDeviceSynchronize(); // kernel launches are asynchronous<br>
high_resolution_clock::time_point t2 = high_resolution_clock::now();<br>
duration t = t2 - t1;<br>
print t.count();<br>
</blockquote>

* Array size
<br><img src="img/Capture2.PNG" width="500"></img>
* _Observation_ The CPU implementation is always faster than the GPU implementation; the reason might be that there is considerable overhead in the GPU implementation. My speculation is that, as the scan proceeds to the next level, the number of idling threads increases. This could be optimized by removing idle threads before launching the kernel for the next level; with that approach, the indices threads use to access array elements must be recalculated properly.

* Block size
<br><img src="img/Capture3.PNG" width="500"></img>
* _Observation_ As block size increases, the time consumed to perform the scan decreases and at some point stabilizes. The reason is the same as analyzed in the last project.
Binary file added img/Capture1.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/Capture2.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/Capture3.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 12 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,14 @@
#include "testing_helpers.hpp"

int main(int argc, char* argv[]) {
const int SIZE = 1 << 8;
const int SIZE = 1 << 23;
const int NPOT = SIZE - 3;
int a[SIZE], b[SIZE], c[SIZE];
int *a, *b, *c;

// allocate memory for test data
a = new int [SIZE];
b = new int [SIZE];
c = new int [SIZE];

// Scan tests

Expand Down Expand Up @@ -120,4 +125,9 @@ int main(int argc, char* argv[]) {
count = StreamCompaction::Efficient::compact(NPOT, c, a);
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

// free memory
delete [] a;
delete [] b;
delete [] c;
}
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ set(SOURCE_FILES

cuda_add_library(stream_compaction
${SOURCE_FILES}
OPTIONS -arch=sm_20
OPTIONS -arch=sm_50
)
14 changes: 14 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ namespace Common {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // Flag each element for compaction: 1 if it is kept (non-zero), else 0.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // guard the grid tail; grids rarely divide n evenly
    if (tid < n) {
        bools[tid] = (idata[tid] != 0) ? 1 : 0;
    }
}

/**
Expand All @@ -33,6 +40,13 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // Scatter step of stream compaction: every kept element (bools[tid] != 0)
    // is written to the destination slot given by the scanned flags.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // guard the grid tail and skip discarded elements
    if (tid < n && bools[tid] != 0) {
        odata[indices[tid]] = idata[tid];
    }
}

}
Expand Down
1 change: 1 addition & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ inline int ilog2ceil(int x) {


namespace StreamCompaction {
const int BLOCK_SIZE = 256;
namespace Common {
__global__ void kernMapToBoolean(int n, int *bools, const int *idata);

Expand Down
43 changes: 40 additions & 3 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#include <cstdio>
#include <chrono>
using namespace std::chrono;
#include "cpu.h"
#include <iostream>
using std::cout;
using std::endl;

namespace StreamCompaction {
namespace CPU {
Expand All @@ -9,7 +14,15 @@ namespace CPU {
*/
void scan(int n, int *odata, const int *idata) {
// TODO
printf("TODO\n");
//printf("TODO\n");
high_resolution_clock::time_point t1 = high_resolution_clock::now();
odata[0] = 0;
for (int i = 1; i < n; ++i) {
odata[i] = odata[i - 1] + idata[i - 1];
}
high_resolution_clock::time_point t2 = high_resolution_clock::now();
duration<int, std::milli> t12 = duration_cast<duration<int, std::milli>>(t2 - t1);
cout << "----------Time consumed: " << t12.count() << " ms----------" << endl;
}

/**
Expand All @@ -19,7 +32,14 @@ void scan(int n, int *odata, const int *idata) {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    // CPU stream compaction without a scan: copy every non-zero element of
    // idata into odata, preserving order, and return how many were kept.
    // (fix) removed the stray early `return -1;` that made the loop unreachable.
    int m = 0;

    for (int i = 0; i < n; ++i) {
        if (idata[i] == 0) continue;   // drop zeros
        odata[m++] = idata[i];
    }

    return m;   // number of elements written to odata
}

/**
Expand All @@ -29,7 +49,24 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
*/
int compactWithScan(int n, int *odata, const int *idata) {
    // CPU stream compaction via map / exclusive-scan / scatter.
    // Returns the number of non-zero elements copied (in order) into odata.
    // (fix) removed the stray early `return -1;`, and corrected the count:
    // the last entry of an *exclusive* scan omits the final element's flag,
    // so the total kept is scan[n-1] + flag[n-1], not scan[n-1] alone.
    if (n <= 0) {
        return 0;
    }

    int *nonZero = new int [n];

    // map: 1 for elements to keep, 0 for zeros
    for (int i = 0; i < n; ++i) {
        nonZero[i] = idata[i] == 0 ? 0 : 1;
    }

    // exclusive scan of the flags into odata (inlined plain loop so this
    // block is self-contained and does not print timing a second time)
    odata[0] = 0;
    for (int i = 1; i < n; ++i) {
        odata[i] = odata[i - 1] + nonZero[i - 1];
    }

    // total kept = last scan value + last flag
    int m = odata[n - 1] + nonZero[n - 1];

    // scatter in place: the destination odata[i] (a scan value) is always
    // <= i, so a write never clobbers a scan value that is read later
    for (int i = 0; i < n; ++i) {
        if (nonZero[i] == 0) continue;
        odata[odata[i]] = idata[i];
    }

    delete [] nonZero;

    return m;
}

}
Expand Down
142 changes: 140 additions & 2 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
@@ -1,19 +1,106 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
using namespace std::chrono;
#include <iostream>
using std::cout;
using std::endl;
#include "common.h"
#include "efficient.h"

namespace StreamCompaction {
namespace Efficient {

// TODO: __global__
// One pass of the Blelloch up-sweep (reduce) phase, applied in place for a
// single stride `step` (a power of two chosen by the host loop in scanOnGPU).
// NOTE(review): partner selection is mirrored from the array tail (rIndex);
// for a power-of-two n this matches the standard up-sweep pairing -- confirm
// callers only pass power-of-two sizes.
__global__ void upSweep(const int n, const int step, int *data) {

int index = blockIdx.x * blockDim.x + threadIdx.x;

// guard the grid tail
if (index >= n) {
return;
}

// distance of this element from the end of the array
int rIndex = n - 1 - index;
int mask = 1;

// builds mask = 2 * step - 1 by doubling i until it reaches step
for (int i = 1; i != step; (i <<= 1), (mask = mask << 1 | 1));

// active threads (mirrored index a multiple of 2*step) accumulate the
// partner element `step` positions earlier
if (index - step >= 0 && (rIndex & mask) == 0) {
data[index] = data[index] + data[index - step];
}
}

// One pass of the Blelloch down-sweep phase, applied in place for a single
// stride `step`: each active pair swaps and accumulates, converting the
// reduced tree left by upSweep into an exclusive scan.
// NOTE(review): same mirrored-from-the-tail indexing as upSweep; appears to
// assume a power-of-two n -- confirm with callers.
__global__ void downSweep(const int n, const int step, int *data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;

// guard the grid tail
if (index >= n) {
return;
}

// distance of this element from the end of the array
int rIndex = n - 1 - index;
int mask = 1;

// builds mask = 2 * step - 1 by doubling i until it reaches step
for (int i = 1; i != step; (i <<= 1), (mask = mask << 1 | 1));

if (index - step >= 0 && (rIndex & mask) == 0) {
// swap-and-add: the earlier element receives this element's old value,
// while this element accumulates both
auto tmp = data[index];
data[index] += data[index - step];
data[index - step] = tmp;
}
}

// Runs a complete work-efficient (Blelloch) exclusive scan in place on the
// device buffer dev_data. Host-side driver: launches one kernel per level.
// NOTE(review): the kernels' mask logic appears to assume n is a power of
// two -- confirm callers pad the input accordingly.
void scanOnGPU(const int n, int *dev_data) {
dim3 blockCount = (n - 1) / BLOCK_SIZE + 1;   // ceil(n / BLOCK_SIZE)
int step;

// up-sweep: build partial sums, doubling the stride each level
for (step = 1; step < n; step <<= 1) {
upSweep<<<blockCount, BLOCK_SIZE>>>(n, step, dev_data);
}

// set last element to 0 -- the identity that seeds the exclusive scan
cudaMemset(&dev_data[n - 1], 0, sizeof(int));

// down-sweep: push partial sums back down, halving the stride each level
for (step >>= 1; step > 0; step >>= 1) {
downSweep<<<blockCount, BLOCK_SIZE>>>(n, step, dev_data);
}
}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
    // GPU work-efficient exclusive prefix-sum: copies idata to the device,
    // scans in place via scanOnGPU, and copies the result back into odata.
    // Prints the elapsed scan time (transfers excluded) to stdout.
    // (fix) removed the leftover debug `printf("TODO\n");` and added error
    // checks after each CUDA call, matching the file's checkCUDAError style.
    int *dev_data;

    // device memory allocation
    cudaMalloc((void**)&dev_data, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_data");

    // copy input data to device
    cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n,
            cudaMemcpyHostToDevice);
    checkCUDAError("Failed to copy idata to device");

    // tic -- time only the scan itself, not the transfers
    std::chrono::high_resolution_clock::time_point t1 =
            std::chrono::high_resolution_clock::now();

    // do scan (in place on the device buffer)
    scanOnGPU(n, dev_data);

    // toc -- launches are asynchronous, so synchronize before reading the
    // clock; this also surfaces any kernel execution error
    cudaDeviceSynchronize();
    checkCUDAError("Kernel execution failed in scanOnGPU");
    std::chrono::high_resolution_clock::time_point t2 =
            std::chrono::high_resolution_clock::now();
    std::chrono::duration<int, std::milli> t12 =
            std::chrono::duration_cast<std::chrono::duration<int, std::milli> >(t2 - t1);
    std::cout << "----------Time consumed: " << t12.count() << " ms----------" << std::endl;

    // copy result to host
    cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n,
            cudaMemcpyDeviceToHost);
    checkCUDAError("Failed to copy result to host");

    // free memory on device
    cudaFree(dev_data);
}

/**
Expand All @@ -27,7 +114,58 @@ void scan(int n, int *odata, const int *idata) {
*/
int compact(int n, int *odata, const int *idata) {
    // GPU stream compaction: map to 0/1 flags, exclusive-scan the flags to
    // get scatter destinations, then scatter kept elements into odata.
    // Returns the number of non-zero elements kept.
    // (fix) the count was read from dev_boolScan[n-1] alone; an *exclusive*
    // scan's last entry omits the final element's flag, so the result was
    // short by one whenever idata[n-1] != 0. The count is now
    // scan[n-1] + flag[n-1]. Also removed the stray early `return -1;`.
    int count;
    int lastBool;
    int *dev_data;
    int *dev_dataCopy;
    int *dev_bool;
    int *dev_boolScan;

    // device memory allocation
    cudaMalloc((void**)&dev_data, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_data");

    cudaMalloc((void**)&dev_dataCopy, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_dataCopy");

    cudaMalloc((void**)&dev_bool, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_bool");

    cudaMalloc((void**)&dev_boolScan, sizeof(int) * n);
    checkCUDAError("Failed to allocate dev_boolScan");

    // copy input data to device
    cudaMemcpy((void*)dev_data, (const void*)idata, sizeof(int) * n,
            cudaMemcpyHostToDevice);
    checkCUDAError("Failed to copy idata to device");

    dim3 blockCount = (n - 1) / BLOCK_SIZE + 1;   // ceil(n / BLOCK_SIZE)

    // map to booleans: 1 = keep, 0 = discard
    Common::kernMapToBoolean<<<blockCount, BLOCK_SIZE>>>(n, dev_bool, dev_data);

    // exclusive-scan the booleans to obtain scatter destinations
    cudaMemcpy((void*)dev_boolScan, (const void*)dev_bool, sizeof(int) * n,
            cudaMemcpyDeviceToDevice);
    scanOnGPU(n, dev_boolScan);

    // scatter from an untouched copy so reads and writes do not alias
    cudaMemcpy((void*)dev_dataCopy, (const void*)dev_data, sizeof(int) * n,
            cudaMemcpyDeviceToDevice);
    Common::kernScatter<<<blockCount, BLOCK_SIZE>>>(n, dev_data, dev_dataCopy,
            dev_bool, dev_boolScan);

    // copy result to host
    cudaMemcpy((void*)odata, (const void*)dev_data, sizeof(int) * n,
            cudaMemcpyDeviceToHost);
    cudaMemcpy((void*)&count, (const void*)&dev_boolScan[n - 1], sizeof(int),
            cudaMemcpyDeviceToHost);
    cudaMemcpy((void*)&lastBool, (const void*)&dev_bool[n - 1], sizeof(int),
            cudaMemcpyDeviceToHost);
    checkCUDAError("Failed to copy results to host");
    count += lastBool;   // include the final element's flag (see fix note)

    // free memory on device
    cudaFree(dev_data);
    cudaFree(dev_dataCopy);
    cudaFree(dev_bool);
    cudaFree(dev_boolScan);

    return count;
}

}
Expand Down
Loading