Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Final Hw3 Submission #434

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion HW3/P2/mandelbrot.cl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,21 @@ mandelbrot(__global __read_only float *coords_real,

if ((x < w) && (y < h)) {
    // z starts at 0 + 0i; c is this work-item's point in the complex plane.
    // NOTE: "z_real, z_imag = 0, 0;" was a Python-ism — in C the comma
    // operator only executed "z_imag = 0" and left z_real uninitialized.
    z_real = 0;
    z_imag = 0;
    c_real = coords_real[x + y*w];
    c_imag = coords_imag[x + y*w];

    // Iterate z <- z^2 + c until |z|^2 > 4 (escape) or max_iter is reached.
    for (iter = 0; iter < max_iter; iter++) {
        if ((z_real * z_real + z_imag * z_imag) > 4)
            break;
        // Save the old real part: the imaginary update must use the
        // value of z_real from BEFORE this iteration's real update.
        z_real_old = z_real;
        z_real = c_real + (z_real * z_real) - (z_imag * z_imag);
        z_imag = c_imag + 2 * z_real_old * z_imag;
    }

    // Store the escape iteration count for this pixel.
    out_counts[x + y*w] = iter;
}
}
22 changes: 22 additions & 0 deletions HW3/P3/P3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
After running the timing code, the best configuration is as follows:

configuration ('coalesced',256,128): 0.00301426 seconds

I am running the code on my 2012 MacBook Pro 13" with the following specs:

---------------------------
Apple Apple version: OpenCL 1.2 (May 10 2015 19:38:45)
The devices detected on platform Apple are:
---------------------------
Intel(R) Core(TM) i5-3210M CPU @ 2.50GHz [Type: CPU ]
Maximum clock Frequency: 2500 MHz
Maximum allocable memory size: 2147 MB
Maximum work group size 1024
---------------------------
HD Graphics 4000 [Type: GPU ]
Maximum clock Frequency: 1100 MHz
Maximum allocable memory size: 268 MB
Maximum work group size 512
---------------------------
This context is associated with 2 devices
The queue is using the device: HD Graphics 4000
27 changes: 15 additions & 12 deletions HW3/P3/sum.cl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
unsigned int global_size = get_global_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0); i < N; i += global_size) {
sum += x[i];
}

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
Expand All @@ -24,10 +24,12 @@ __kernel void sum_coalesced(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
if (local_id < s) {
fast[local_id] += fast[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
}

Expand All @@ -48,12 +50,11 @@ __kernel void sum_blocked(__global float* x,
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (int i = get_global_id(0) * k; i < N && i < (get_global_id(0)+1) * k; i++) {
sum += x[i];
}

fast[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);

// binary reduction
//
Expand All @@ -64,9 +65,11 @@ __kernel void sum_blocked(__global float* x,
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
for (;;) { // YOUR CODE HERE
; // YOUR CODE HERE
for (uint s = get_local_size(0) / 2; s > 0; s >>= 1) {
if (local_id < s) {
fast[local_id] += fast[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
}
46 changes: 43 additions & 3 deletions HW3/P4/median_filter.cl
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,42 @@ median_3x3(__global __read_only float *in_values,
// Note: It may be easier for you to implement median filtering
// without using the local buffer, first, then adjust your code to
// use such a buffer after you have that working.



// Load into buffer (with 1-pixel halo).
//
// It may be helpful to consult HW3 Problem 5, and
// https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
//
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Global position of this work-item's output pixel.
const int x = get_global_id(0);
const int y = get_global_id(1);

// Position of this work-item within its workgroup.
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// Image-space coordinates of the upper-left corner of the local
// buffer, including the halo.
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// Coordinates of our pixel inside the local buffer.
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group; the first buf_w threads
// each stream one column of the buffer from global memory.
const int idx_1D = ly * get_local_size(0) + lx;

if (idx_1D < buf_w) {
    for (int row = 0; row < buf_h; row++) {
        // Clamp out-of-bounds coordinates to the nearest valid pixel
        // (replaces the undefined FETCH macro, which did not compile).
        int img_x = clamp(buf_corner_x + idx_1D, 0, w - 1);
        int img_y = clamp(buf_corner_y + row, 0, h - 1);
        buffer[row * buf_w + idx_1D] = in_values[img_y * w + img_x];
    }
}

// All loads must complete before any thread reads its neighborhood.
barrier(CLK_LOCAL_MEM_FENCE);

// Compute the 3x3 median for each core (non-halo) pixel.
// Each thread in the valid region (x < w, y < h) writes back the
// median of its 3x3 neighborhood.
if ((y < h) && (x < w)) {
    out_values[y * w + x] =
        median9(buffer[(buf_y-1) * buf_w + buf_x-1],
                buffer[(buf_y-1) * buf_w + buf_x],
                buffer[(buf_y-1) * buf_w + buf_x+1],
                buffer[buf_y * buf_w + buf_x-1],
                buffer[buf_y * buf_w + buf_x],
                buffer[buf_y * buf_w + buf_x+1],
                buffer[(buf_y+1) * buf_w + buf_x-1],
                buffer[(buf_y+1) * buf_w + buf_x],
                buffer[(buf_y+1) * buf_w + buf_x+1]);
}
}
29 changes: 29 additions & 0 deletions HW3/P5/P5.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Results were as follows:

Maze 1:
Part1: Finished after 904 iterations, 418.91049 ms total, 0.46339656892 ms per iteration
Found 2 regions
Part2: Finished after 531 iterations, 241.37665 ms total, 0.45456996761 ms per iteration
Found 2 regions
Part 3: Finished after 10 iterations, 4.61896 ms total, 0.4618960467 ms per iteration
Found 2 regions
Part 4: Finished after 10 iterations, 13.07019 ms total, 1.30701931928 ms per iteration
Found 2 regions

Maze 2:
Part1: Finished after 523 iterations, 238.76491 ms total, 0.4565294731 ms per iteration
Found 35 regions
Part 2: Finished after 283 iterations, 128.76932 ms total, 0.45501526965 ms per iteration
Found 35 regions
Part 3: Finished after 9 iterations, 4.05541 ms total, 0.45060196899 ms per iteration
Found 35 regions
Part 4: Finished after 9 iterations, 11.66303 ms total, 1.2958923333 ms per iteration
Found 35 regions

My results for Part 4 were much slower than my results for Part 3 (roughly 3 to 4 times slower), which suggests that using a single thread is not a good choice. Note that the number of iterations stays the same and only the time per iteration increases. This is probably because there is a tradeoff between memory and compute: given the specific architecture of my machine, the computation in this case is compute-bound rather than memory-bound. The benefits of single-threading (such as avoiding redundant reads of global memory) are outweighed by the cost of performing the computations serially and losing parallelism. Perhaps if the pixel labels were more similar, or if my GPU were memory-bound, using a single thread might be a good choice, but not in this case.

Part 5:

On the correctness front, the atomic_min() operation ensures that the comparison and the write-out of the result are done in one atomic step, i.e. no other thread can intervene while our thread's operation is in progress. Switching to min() would mean that race conditions could occur, in the sense that our old label might be updated redundantly. The final result would still be correct.

However, in terms of time, we run the risk of extra redundant updates, which could hurt our runtime and cause our algorithm to run more slowly. Even though the min() operation is itself faster than the atomic_min() operation, the redundant updates might increase the overall runtime.
47 changes: 42 additions & 5 deletions HW3/P5/label_regions.cl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ propagate_labels(__global __read_write int *labels,

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

int old_label;
// Will store the output value
int new_label;
Expand All @@ -80,20 +80,57 @@ propagate_labels(__global __read_write int *labels,
old_label = buffer[buf_y * buf_w + buf_x];

// CODE FOR PARTS 2 and 4 HERE (part 4 will replace part 2)

// Overwritten code for PART 2
// if (old_label < w * h) {
// // Grab grandparent
// buffer[buf_y * buf_w + buf_x] = labels[old_label];
// }

// PART 4
// Update workgroup labels using single thread
if (lx == 0 && ly == 0) {
// Keeps track of previous key and result
int prev_label = -1;
int prev_result;

// Iterate through entire buffer
for (int i = 0; i < buf_w * buf_h; i++) {
int temp_label = buffer[i];

if (temp_label < w * h) {
// Reset if previous is not the same as current label
if (prev_label != temp_label) {
prev_label = temp_label;
prev_result = labels[prev_label];
}
buffer[i] = prev_result;
}
}
}

// stay in bounds
if ((x < w) && (y < h)) {
if ((x < w) && (y < h) && (old_label < w * h)) {
// CODE FOR PART 1 HERE
// We set new_label to the value of old_label, but you will need
// to adjust this for correctness.

// Update new label
new_label = old_label;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After parts 2 and 4, you should use buffer[buf_w * buf_y + buf_x] instead of old_label.


if (new_label < w * h) {
// Take minimum of minimums over rows and columns
int row_min = min(buffer[buf_y * buf_w + buf_x - 1], buffer[buf_y * buf_w + buf_x + 1]);
int col_min = min(buffer[(buf_y - 1) * buf_w + buf_x], buffer[(buf_y + 1) * buf_w + buf_x]);
new_label = min(row_min, col_min);
}

if (new_label != old_label) {
// CODE FOR PART 3 HERE
// indicate there was a change this iteration.
// multiple threads might write this.
*(changed_flag) += 1;
labels[y * w + x] = new_label;

atomic_min(&labels[old_label], new_label);
atomic_min(&labels[y * w + x], new_label);
}
}
}