Hw3 #418

Open · wants to merge 5 commits into base: HW3

Binary file added HW3/.DS_Store
Binary file not shown.
Empty file added HW3/.gitignore
Empty file.
19 changes: 16 additions & 3 deletions HW3/P2/mandelbrot.cl
@@ -9,11 +9,24 @@ mandelbrot(__global __read_only float *coords_real,
const int y = get_global_id(1);

float c_real, c_imag;
float z_real, z_imag, zr_temp;
int iter;

z_real = z_imag = 0.0f;
iter = 0;
c_real = coords_real[y * w + x];
c_imag = coords_imag[y * w + x];

if ((x < w) && (y < h)) {
while (z_real * z_real + z_imag * z_imag < 4.0f && iter < max_iter) {
zr_temp = z_real;
z_real = z_real * z_real - z_imag * z_imag + c_real;
z_imag = 2.0f * zr_temp * z_imag + c_imag;

iter++;
}

out_counts[y * w + x] = iter;
}

}
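For context, here is a minimal host-side sketch of how this kernel might be launched with PyOpenCL. The grid dimensions, max_iter, the argument order, and the uint32 type of out_counts are illustrative assumptions, not taken from the submission:

import numpy as np
import pyopencl as cl

# Hypothetical harness for mandelbrot.cl; sizes and the coordinate grid
# are illustrative.
w, h, max_iter = 1024, 768, 511
re = np.linspace(-2.0, 1.0, w, dtype=np.float32)
im = np.linspace(-1.5, 1.5, h, dtype=np.float32)
coords_real = np.tile(re, (h, 1)).ravel()   # row-major, matches y * w + x
coords_imag = np.repeat(im, w)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
d_real = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=coords_real)
d_imag = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=coords_imag)
out_counts = np.empty(w * h, dtype=np.uint32)   # assumed counts type
d_counts = cl.Buffer(ctx, mf.WRITE_ONLY, out_counts.nbytes)

program = cl.Program(ctx, open('mandelbrot.cl').read()).build()
# One thread per pixel; the kernel's (x < w) && (y < h) guard protects any
# extra threads if the global size were rounded up to a workgroup multiple.
program.mandelbrot(queue, (w, h), None,
                   d_real, d_imag, d_counts,
                   np.int32(w), np.int32(h), np.int32(max_iter))
cl.enqueue_copy(queue, out_counts, d_counts)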
92 changes: 92 additions & 0 deletions HW3/P3/P3.txt
@@ -0,0 +1,92 @@
Below, please find my results:

#0: Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz on Apple
#1: Intel(R) Iris(TM) Graphics 6100 on Apple

Best one: configuration ('coalesced', 512, 128): 0.00284936 seconds

coalesced reads, workgroups: 8, num_workers: 4, 0.14073944 seconds
coalesced reads, workgroups: 8, num_workers: 8, 0.07071592 seconds
coalesced reads, workgroups: 8, num_workers: 16, 0.04490056 seconds
coalesced reads, workgroups: 8, num_workers: 32, 0.02539208 seconds
coalesced reads, workgroups: 8, num_workers: 64, 0.01459816 seconds
coalesced reads, workgroups: 8, num_workers: 128, 0.00884136 seconds
coalesced reads, workgroups: 16, num_workers: 4, 0.07848448 seconds
coalesced reads, workgroups: 16, num_workers: 8, 0.03739232 seconds
coalesced reads, workgroups: 16, num_workers: 16, 0.02174264 seconds
coalesced reads, workgroups: 16, num_workers: 32, 0.01281008 seconds
coalesced reads, workgroups: 16, num_workers: 64, 0.00894688 seconds
coalesced reads, workgroups: 16, num_workers: 128, 0.00466936 seconds
coalesced reads, workgroups: 32, num_workers: 4, 0.04298944 seconds
coalesced reads, workgroups: 32, num_workers: 8, 0.02368376 seconds
coalesced reads, workgroups: 32, num_workers: 16, 0.01368224 seconds
coalesced reads, workgroups: 32, num_workers: 32, 0.00891504 seconds
coalesced reads, workgroups: 32, num_workers: 64, 0.00464296 seconds
coalesced reads, workgroups: 32, num_workers: 128, 0.00299816 seconds
coalesced reads, workgroups: 64, num_workers: 4, 0.02353056 seconds
coalesced reads, workgroups: 64, num_workers: 8, 0.01384016 seconds
coalesced reads, workgroups: 64, num_workers: 16, 0.00888672 seconds
coalesced reads, workgroups: 64, num_workers: 32, 0.00468648 seconds
coalesced reads, workgroups: 64, num_workers: 64, 0.00326712 seconds
coalesced reads, workgroups: 64, num_workers: 128, 0.00355136 seconds
coalesced reads, workgroups: 128, num_workers: 4, 0.02740848 seconds
coalesced reads, workgroups: 128, num_workers: 8, 0.01566936 seconds
coalesced reads, workgroups: 128, num_workers: 16, 0.0089792 seconds
coalesced reads, workgroups: 128, num_workers: 32, 0.00475928 seconds
coalesced reads, workgroups: 128, num_workers: 64, 0.00319496 seconds
coalesced reads, workgroups: 128, num_workers: 128, 0.0034688 seconds
coalesced reads, workgroups: 256, num_workers: 4, 0.02341696 seconds
coalesced reads, workgroups: 256, num_workers: 8, 0.01217416 seconds
coalesced reads, workgroups: 256, num_workers: 16, 0.00652944 seconds
coalesced reads, workgroups: 256, num_workers: 32, 0.00355272 seconds
coalesced reads, workgroups: 256, num_workers: 64, 0.00321208 seconds
coalesced reads, workgroups: 256, num_workers: 128, 0.002994 seconds
coalesced reads, workgroups: 512, num_workers: 4, 0.02225928 seconds
coalesced reads, workgroups: 512, num_workers: 8, 0.0117604 seconds
coalesced reads, workgroups: 512, num_workers: 16, 0.00665648 seconds
coalesced reads, workgroups: 512, num_workers: 32, 0.00359816 seconds
coalesced reads, workgroups: 512, num_workers: 64, 0.00299704 seconds
coalesced reads, workgroups: 512, num_workers: 128, 0.00284936 seconds
blocked reads, workgroups: 8, num_workers: 4, 0.14570672 seconds
blocked reads, workgroups: 8, num_workers: 8, 0.08341456 seconds
blocked reads, workgroups: 8, num_workers: 16, 0.05593968 seconds
blocked reads, workgroups: 8, num_workers: 32, 0.03242192 seconds
blocked reads, workgroups: 8, num_workers: 64, 0.01547184 seconds
blocked reads, workgroups: 8, num_workers: 128, 0.00994824 seconds
blocked reads, workgroups: 16, num_workers: 4, 0.07736544 seconds
blocked reads, workgroups: 16, num_workers: 8, 0.04720448 seconds
blocked reads, workgroups: 16, num_workers: 16, 0.03139616 seconds
blocked reads, workgroups: 16, num_workers: 32, 0.01388616 seconds
blocked reads, workgroups: 16, num_workers: 64, 0.00971104 seconds
blocked reads, workgroups: 16, num_workers: 128, 0.00672608 seconds
blocked reads, workgroups: 32, num_workers: 4, 0.04234944 seconds
blocked reads, workgroups: 32, num_workers: 8, 0.02223264 seconds
blocked reads, workgroups: 32, num_workers: 16, 0.0127568 seconds
blocked reads, workgroups: 32, num_workers: 32, 0.00973904 seconds
blocked reads, workgroups: 32, num_workers: 64, 0.00668352 seconds
blocked reads, workgroups: 32, num_workers: 128, 0.00656008 seconds
blocked reads, workgroups: 64, num_workers: 4, 0.02402304 seconds
blocked reads, workgroups: 64, num_workers: 8, 0.01290296 seconds
blocked reads, workgroups: 64, num_workers: 16, 0.00971152 seconds
blocked reads, workgroups: 64, num_workers: 32, 0.006676 seconds
blocked reads, workgroups: 64, num_workers: 64, 0.0063536 seconds
blocked reads, workgroups: 64, num_workers: 128, 0.006968 seconds
blocked reads, workgroups: 128, num_workers: 4, 0.02390144 seconds
blocked reads, workgroups: 128, num_workers: 8, 0.0153476 seconds
blocked reads, workgroups: 128, num_workers: 16, 0.01144752 seconds
blocked reads, workgroups: 128, num_workers: 32, 0.00693616 seconds
blocked reads, workgroups: 128, num_workers: 64, 0.00701288 seconds
blocked reads, workgroups: 128, num_workers: 128, 0.00648784 seconds
blocked reads, workgroups: 256, num_workers: 4, 0.01887544 seconds
blocked reads, workgroups: 256, num_workers: 8, 0.01251208 seconds
blocked reads, workgroups: 256, num_workers: 16, 0.00761376 seconds
blocked reads, workgroups: 256, num_workers: 32, 0.00619296 seconds
blocked reads, workgroups: 256, num_workers: 64, 0.00656528 seconds
blocked reads, workgroups: 256, num_workers: 128, 0.00608576 seconds
blocked reads, workgroups: 512, num_workers: 4, 0.018794 seconds
blocked reads, workgroups: 512, num_workers: 8, 0.01230216 seconds
blocked reads, workgroups: 512, num_workers: 16, 0.00884832 seconds
blocked reads, workgroups: 512, num_workers: 32, 0.0061172 seconds
blocked reads, workgroups: 512, num_workers: 64, 0.00630272 seconds
blocked reads, workgroups: 512, num_workers: 128, 0.00573808 seconds
configuration ('coalesced', 512, 128): 0.00284936 seconds
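For reference, the two schemes differ only in which elements each thread reads; a small Python sketch of the index patterns (values are hypothetical, not from tune.py) is:

# Which elements one thread reads under each scheme; N, global_size,
# and tid are made-up illustrative values.
N, global_size, tid = 32, 8, 3

# Coalesced: thread tid reads tid, tid + global_size, ... so on each pass
# neighboring threads touch adjacent addresses.
coalesced = list(range(tid, N, global_size))        # [3, 11, 19, 27]

# Blocked: thread tid reads one contiguous chunk of k = ceil(N / global_size)
# elements, so neighboring threads are k addresses apart.
k = -(-N // global_size)                            # ceiling division
blocked = [i for i in range(tid * k, (tid + 1) * k) if i < N]   # [12, 13, 14, 15]

The coalesced pattern lets the hardware merge a group's simultaneous reads into few memory transactions, which is consistent with the faster coalesced timings above.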
36 changes: 27 additions & 9 deletions HW3/P3/sum.cl
@@ -5,11 +5,14 @@ __kernel void sum_coalesced(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t local_size = get_local_size(0);
size_t global_id = get_global_id(0);
size_t global_size = get_global_size(0);

// thread i (i.e., with i = get_global_id()) should add x[i],
// x[i + get_global_size()], ... up to N-1, and store in sum.
for (int i = global_id; i < N; i += global_size) {
sum += x[i];
}

fast[local_id] = sum;
@@ -24,8 +27,12 @@
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/

for (uint s = local_size / 2; s > 0; s >>= 1) {
if (local_id < s) {
fast[local_id] += fast[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
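The halving loop above is the standard local-memory tree reduction; a quick NumPy emulation of a single workgroup (illustrative only) shows the same indexing:

import numpy as np

# Emulate the in-workgroup tree reduction for one workgroup.
# local_size must be a power of two, as the kernel comments assume.
local_size = 8
fast = np.arange(local_size, dtype=np.float32)   # stand-in for the local buffer

s = local_size // 2
while s > 0:
    # In the kernel only threads with local_id < s perform this add; the
    # barrier then makes the writes visible before the next halving.
    fast[:s] += fast[s:2 * s]
    s //= 2

assert fast[0] == np.arange(local_size).sum()    # fast[0] holds the group total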
@@ -38,7 +45,10 @@ __kernel void sum_blocked(__global float* x,
{
float sum = 0;
size_t local_id = get_local_id(0);
size_t local_size = get_local_size(0);
long global_id = get_global_id(0);
size_t global_size = get_global_size(0);
int k = ceil((float)N / global_size);

// thread with global_id 0 should add 0..k-1
// thread with global_id 1 should add k..2k-1
@@ -48,8 +58,12 @@
//
// Be careful that each thread stays in bounds, both relative to
// size of x (i.e., N), and the range it's assigned to sum.

for (long i = global_id * k; i < (global_id + 1) * k; i++) {
// check bounds
if (i < N) {
sum += x[i];
}
}

fast[local_id] = sum;
@@ -64,8 +78,12 @@
// You can assume get_local_size(0) is a power of 2.
//
// See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/

for (uint s = local_size / 2; s > 0; s >>= 1) {
if (local_id < s) {
fast[local_id] += fast[local_id + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}

if (local_id == 0) partial[get_group_id(0)] = fast[0];
4 changes: 2 additions & 2 deletions HW3/P3/tune.py
@@ -23,7 +23,7 @@ def create_data(N):
times = {}

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
@@ -40,7 +40,7 @@ format(num_workgroups, num_workers, seconds))
format(num_workgroups, num_workers, seconds))

for num_workgroups in 2 ** np.arange(3, 10):
partial_sums = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 4 * num_workgroups)
host_partial = np.empty(num_workgroups).astype(np.float32)
for num_workers in 2 ** np.arange(2, 8):
local = cl.LocalMemory(num_workers * 4)
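After either kernel runs, each workgroup has deposited one float into partial_sums, and the host finishes the job. A sketch of that final step, reusing the names above (the enqueue and timing logic are elided in this view, so this is an assumption about the surrounding code):

# Hypothetical final step: copy the per-workgroup partial sums back and
# finish the reduction on the host.
cl.enqueue_copy(queue, host_partial, partial_sums)
total = host_partial.sum()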
Binary file added HW3/P4/.DS_Store
Binary file not shown.
Empty file added HW3/P4/.gitignore
Empty file.
Binary file added HW3/P4/image.npz.zip
Binary file not shown.
67 changes: 66 additions & 1 deletion HW3/P4/median_filter.cl
@@ -1,5 +1,25 @@
#include "median9.h"

// returns pixel value at x, y
float FETCH(__global __read_only float *in_values,
int w, int h,
int x, int y)
{
if (x < 0) {
x = 0;
}
else if (x >= w) {
x = w - 1;
}
if (y < 0) {
y = 0;
}
else if (y >= h) {
y = h - 1;
}
return in_values[y * w + x];
}
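FETCH implements a clamp-to-edge read; an equivalent NumPy sketch (illustrative) is:

import numpy as np

def fetch(img, x, y):
    # Clamp-to-edge read, mirroring FETCH in median_filter.cl.
    h, w = img.shape
    return img[np.clip(y, 0, h - 1), np.clip(x, 0, w - 1)]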

// 3x3 median filter
__kernel void
median_3x3(__global __read_only float *in_values,
@@ -22,13 +42,58 @@ median_3x3(__global __read_only float *in_values,
// Note that globally out-of-bounds pixels should be replaced
// with the nearest valid pixel's value.

// Global position of output pixel
const int x = get_global_id(0);
const int y = get_global_id(1);

// Local position relative to (0, 0) in workgroup
const int lx = get_local_id(0);
const int ly = get_local_id(1);

// Coordinates of the upper left corner of the buffer in image
// space, including halo
const int buf_corner_x = x - lx - halo;
const int buf_corner_y = y - ly - halo;

// Coordinates of our pixel in the local buffer
const int buf_x = lx + halo;
const int buf_y = ly + halo;

// 1D index of thread within our work-group
const int idx_1D = ly * get_local_size(0) + lx;

// load the image tile (including halo) into the local buffer: each of
// the first buf_w threads loads one full column
if (idx_1D < buf_w) {
for (int row = 0; row < buf_h; row++) {
buffer[row * buf_w + idx_1D] = \
FETCH(in_values, w, h,
buf_corner_x + idx_1D,
buf_corner_y + row);
}
}

barrier(CLK_LOCAL_MEM_FENCE);

// Compute 3x3 median for each pixel in core (non-halo) pixels
//
// We've given you median9.h, and included it above, so you can
// use the median9() function.


// Each thread in the valid region (x < w, y < h) should write
// back its 3x3 neighborhood median.

// stay within bounds
if ((y < h) && (x < w)) {
// median9 with all combinations of y-1, y, y+1 with x-1, x, x+1
out_values[y * w + x] = median9(buffer[(buf_y-1)*buf_w + buf_x - 1],
buffer[(buf_y-1)*buf_w + buf_x],
buffer[(buf_y-1)*buf_w + buf_x + 1],
buffer[buf_y*buf_w + buf_x - 1],
buffer[buf_y*buf_w + buf_x],
buffer[buf_y*buf_w + buf_x + 1],
buffer[(buf_y+1)*buf_w + buf_x - 1],
buffer[(buf_y+1)*buf_w + buf_x],
buffer[(buf_y+1)*buf_w + buf_x + 1]);
}

}
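As a quick host-side check (illustrative, not part of the submission), the same clamped 3x3 median can be computed with NumPy and compared against the kernel's output on a small image:

import numpy as np

def median_3x3_host(img):
    # Reference 3x3 median with clamp-to-edge borders.
    h, w = img.shape
    padded = np.pad(img, 1, mode='edge')
    out = np.empty_like(img)
    for y in range(h):
        for x in range(w):
            out[y, x] = np.median(padded[y:y + 3, x:x + 3])
    return out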
5 changes: 3 additions & 2 deletions HW3/P4/median_filter.py
@@ -1,8 +1,8 @@
from __future__ import division
import pyopencl as cl
import numpy as np
import imread
import pylab
import os.path

def round_up(global_size, group_size):
r = global_size % group_size
@@ -51,7 +51,8 @@ def numpy_median(image, iterations=10):
properties=cl.command_queue_properties.PROFILING_ENABLE)
print 'The queue is using the device:', queue.device.name

curdir = os.path.dirname(os.path.realpath(__file__))
program = cl.Program(context, open('median_filter.cl').read()).build(options=['-I', curdir])

host_image = np.load('image.npz')['image'].astype(np.float32)[::2, ::2].copy()
host_image_filtered = np.zeros_like(host_image)
62 changes: 62 additions & 0 deletions HW3/P5/P5.txt
@@ -0,0 +1,62 @@
-- PART 1

Maze 1:
Finished after 915 iterations, 213.29096 ms total, 0.233104874317 ms per iteration
Found 2 regions

Maze 2:
Finished after 531 iterations, 122.0564 ms total, 0.229861393597 ms per iteration
Found 35 regions


-- PART 2

Maze 1:
Finished after 529 iterations, 121.61456 ms total, 0.229895198488 ms per iteration
Found 2 regions

Maze 2:
Finished after 272 iterations, 62.71688 ms total, 0.230576764706 ms per iteration
Found 35 regions


-- PART 3

Maze 1:
Finished after 10 iterations, 2.81728 ms total, 0.281728 ms per iteration
Found 2 regions

Maze 2:
Finished after 9 iterations, 2.19712 ms total, 0.244124444444 ms per iteration
Found 35 regions


-- PART 4

Maze 1:
Finished after 10 iterations, 8.49528 ms total, 0.849528 ms per iteration
Found 2 regions

Maze 2:
Finished after 9 iterations, 7.49304 ms total, 0.83256 ms per iteration
Found 35 regions

A graph of my results from Parts 1-4 can be found under "maze_1.png" and "maze_2.png".

From my results above, it is clear that -- given the architecture of my computer -- using a single thread is not a reasonable choice, as the performance is roughly 3-4x worse. Further note that the single thread does not increase the number of iterations, only the time per iteration. In general, the motivation for using a single thread is to avoid some redundant global memory reads, but we lose the benefits of computing in parallel.

In this case, it is clear that the efficiency gained from eliminating these memory redundancies does not offset the slowdown caused by having to perform all the work serially (e.g. having to remember the last fetch). If the pixels' labels were more similar (i.e. more redundancy in the labels), then this optimization would have yielded better results, since more of the expensive trips to global memory could be skipped.

This implies that we are compute bound: global memory reads are comparatively cheap on this device, so the reads saved cannot pay for the serialized computation. Conversely, if the GPU were memory bound (i.e. global memory reads were expensive relative to computation), a single thread could be a good choice. Whether it is depends on factors such as the latency of accessing global memory, the number of iterations the single-thread variant can save, and so on.


-- PART 5

First of all, atomic_min(p, val) takes the 32-bit value stored at the location pointed to by p, computes the minimum of that value and val, and stores the result back at the location pointed to by p, all as a single indivisible read-modify-write.

Given this, if we used a plain min() instead of atomic_min(), one thread could overwrite the correct answer written by another thread.

This is best shown with an example. Say the current label is X3, and two threads are comparing it against two smaller labels, X1 and X2, with X1 < X2 < X3. What should happen (and what does happen with atomic updates) is that the stored label ends up as the minimum of the three, X1.

With a plain min(), however, both threads can read the old value before either writes. One thread might overwrite X3 with X1, and the other might then overwrite it again with X2, which is higher than X1. In this example the label was unintentionally raised from X1 back to X2, forcing at least one more iteration to bring it down again. Thus, after a given iteration the intermediate result may be wrong, but at the very end we still converge to a correct result (it may just take more iterations to reach it).
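A tiny sketch of that interleaving (hypothetical values; sequential Python standing in for two GPU threads) makes the lost update concrete:

# Two "threads" both try to lower the stored label 3 with a plain min().
stored = 3                     # X3
a_val, b_val = 1, 2            # X1 and X2

# Both read the old value before either writes:
a_result = min(stored, a_val)  # 1
b_result = min(stored, b_val)  # 2

stored = a_result              # label drops to 1 (X1) -- correct so far
stored = b_result              # second write raises it to 2 (X2): lost update

# atomic_min() would serialize each read-modify-write, so the second
# operation would see 1 and leave it unchanged.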

As a result, min() can increase the number of iterations, which could lengthen total computation time. On the other hand, min() is faster per call than atomic_min(), so it is unclear which provides superior performance -- it boils down to what hurts more: the extra iterations, or the slower atomic_min() calls.