Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Final Hw3 Submission #434

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@ mandelbrot(__global __read_only float *coords_real,

if ((x < w) && (y < h)) {
    // z starts at 0 + 0i; c is this work-item's point in the complex plane.
    // NOTE: "z_real, z_imag = 0, 0;" was a Python-ism — in C the comma
    // operator only executed "z_imag = 0" and left z_real uninitialized.
    z_real = 0;
    z_imag = 0;
    c_real = coords_real[x + y*w];
    c_imag = coords_imag[x + y*w];

    // Iterate z <- z^2 + c until |z|^2 > 4 (escape) or max_iter is reached.
    for (iter = 0; iter < max_iter; iter++) {
        if ((z_real * z_real + z_imag * z_imag) > 4)
            break;
        // Save the old real part: the imaginary update must use the
        // value of z_real from BEFORE this iteration's real update.
        z_real_old = z_real;
        z_real = c_real + (z_real * z_real) - (z_imag * z_imag);
        z_imag = c_imag + 2 * z_real_old * z_imag;
    }

    // Store the escape iteration count for this pixel.
    out_counts[x + y*w] = iter;
}
}
22 changes: 22 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
After running the timing code, the best configuration is as follows:

configuration ('coalesced',256,128): 0.00301426 seconds

I am running the code on my 2012 MacBook Pro 13" with the following specs:

---------------------------
Apple Apple version: OpenCL 1.2 (May 10 2015 19:38:45)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i5-3210M CPU @ 2.50GHz [Type: CPU ]
Maximum clock Frequency: 2500 MHz
Maximum allocable memory size: 2147 MB
Maximum work group size 1024
---------------------------
HD Graphics 4000 [Type: GPU ]
Maximum clock Frequency: 1100 MHz
Maximum allocable memory size: 268 MB
Maximum work group size 512
---------------------------
This context is associated with 2 devices
The queue is using the device: HD Graphics 4000
27 changes: 15 additions & 12 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
unsigned int global_size = get_global_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0); i < N; i += global_size) {
sum += x[i];
}

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
Expand All @@ -24,10 +24,12 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
if (local_id < s) {
fast[local_id] += fast[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
}

Expand All @@ -48,12 +50,11 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0) * k; i < N && i < (get_global_id(0)+1) * k; i++) {
sum += x[i];
}

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
Expand All @@ -64,9 +65,11 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
if (local_id < s) {
fast[local_id] += fast[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
}
46 changes: 43 additions & 3 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,42 @@ median_3x3(__global __read_only float *in_values,
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.



// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Global position of this work-item's output pixel.
const int x = get_global_id(0);
const int y = get_global_id(1);

// Position of this work-item within its workgroup.
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// Image-space coordinates of the upper-left corner of the local
// buffer, including the halo.
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// Coordinates of our pixel inside the local buffer.
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group; the first buf_w threads
// each stream one column of the buffer from global memory.
const int idx_1D = ly * get_local_size(0) + lx;

if (idx_1D < buf_w) {
    for (int row = 0; row < buf_h; row++) {
        // Clamp out-of-bounds coordinates to the nearest valid pixel
        // (replaces the undefined FETCH macro, which did not compile).
        int img_x = clamp(buf_corner_x + idx_1D, 0, w - 1);
        int img_y = clamp(buf_corner_y + row, 0, h - 1);
        buffer[row * buf_w + idx_1D] = in_values[img_y * w + img_x];
    }
}

// All loads must complete before any thread reads its neighborhood.
barrier(CLK_LOCAL_MEM_FENCE);

// Compute the 3x3 median for each core (non-halo) pixel.
// Each thread in the valid region (x < w, y < h) writes back the
// median of its 3x3 neighborhood.
if ((y < h) && (x < w)) {
    out_values[y * w + x] =
        median9(buffer[(buf_y-1) * buf_w + buf_x-1],
                buffer[(buf_y-1) * buf_w + buf_x],
                buffer[(buf_y-1) * buf_w + buf_x+1],
                buffer[buf_y * buf_w + buf_x-1],
                buffer[buf_y * buf_w + buf_x],
                buffer[buf_y * buf_w + buf_x+1],
                buffer[(buf_y+1) * buf_w + buf_x-1],
                buffer[(buf_y+1) * buf_w + buf_x],
                buffer[(buf_y+1) * buf_w + buf_x+1]);
}
}
29 changes: 29 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Results were as follows:

Maze 1:
Part1: Finished after 904 iterations, 418.91049 ms total, 0.46339656892 ms per iteration
Found 2 regions
Part2: Finished after 531 iterations, 241.37665 ms total, 0.45456996761 ms per iteration
Found 2 regions
Part 3: Finished after 10 iterations, 4.61896 ms total, 0.4618960467 ms per iteration
Found 2 regions
Part 4: Finished after 10 iterations, 13.07019 ms total, 1.30701931928 ms per iteration
Found 2 regions

Maze 2:
Part1: Finished after 523 iterations, 238.76491 ms total, 0.4565294731 ms per iteration
Found 35 regions
Part 2: Finished after 283 iterations, 128.76932 ms total, 0.45501526965 ms per iteration
Found 35 regions
Part 3: Finished after 9 iterations, 4.05541 ms total, 0.45060196899 ms per iteration
Found 35 regions
Part 4: Finished after 9 iterations, 11.66303 ms total, 1.2958923333 ms per iteration
Found 35 regions

My results for Part 4 were much slower than my results for Part 3 (roughly 3 to 4 times slower), which suggests that using a single thread is not a good choice. Note that the number of iterations stays the same and only the time per iteration increases. This is probably because there is a tradeoff between memory and compute: given the specific architecture of my machine, the computation in this case is compute-bound rather than memory-bound. The benefits of single-threading (such as avoiding redundant reads of global memory) are outweighed by the cost of performing the computations serially and losing parallelism. Perhaps if the pixel labels were more similar, or if my GPU were memory-bound, using a single thread might be a good choice, but not in this case.

Part 5:

On the correctness front, the atomic_min() operation ensures that the comparison and the write-out of the result are done in one atomic step, i.e. no other thread can intervene while our thread's operation is in progress. Switching to min() would mean that race conditions could occur, in the sense that our old label might be updated redundantly. The final result would still be correct.

However, in terms of time, we run the risk of extra redundant updates, which could hurt our runtime and cause our algorithm to run more slowly. Even though the min() operation is itself faster than the atomic_min() operation, the redundant updates might increase the overall runtime.
47 changes: 42 additions & 5 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ propagate_labels(__global __read_write int *labels,

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int old_label;
// Will store the output value
int new_label;
Expand All @@ -80,20 +80,57 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// Overwritten code for PART 2
// if (old_label < w * h) {
// // Grab grandparent
// buffer[buf_y * buf_w + buf_x] = labels[old_label];
// }

// PART 4
// Update workgroup labels using single thread
if (lx == 0 && ly == 0) {
// Keeps track of previous key and result
int prev_label = -1;
int prev_result;

// Iterate through entire buffer
for (int i = 0; i < buf_w * buf_h; i++) {
int temp_label = buffer[i];

if (temp_label < w * h) {
// Reset if previous is not the same as current label
if (prev_label != temp_label) {
prev_label = temp_label;
prev_result = labels[prev_label];
}
buffer[i] = prev_result;
}
}
}

// stay in bounds
if ((x < w) && (y < h)) {
if ((x < w) && (y < h) && (old_label < w * h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.

// Update new label
new_label = old_label;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After parts 2 and 4, you should use buffer[buf_w * buf_y + buf_x] instead of old_label.


if (new_label < w * h) {
// Take minimum of minimums over rows and columns
int row_min = min(buffer[buf_y * buf_w + buf_x - 1], buffer[buf_y * buf_w + buf_x + 1]);
int col_min = min(buffer[(buf_y - 1) * buf_w + buf_x], buffer[(buf_y + 1) * buf_w + buf_x]);
new_label = min(row_min, col_min);
}

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;

atomic_min(&labels[old_label], new_label);
atomic_min(&labels[y * w + x], new_label);
}
}
}