Hw3 #430
base: HW3
@@ -0,0 +1,13 @@
The best configuration for my machine is:

configuration ('coalesced', 512, 128): 0.000331392 seconds

The coalesced read is faster than the blocked read on average for the
same number of work groups and workers because more threads can do work
on the same block of fetched memory. In the blocked scheme, once a
thread fetches its block to sum, other threads may have to wait to
fetch their own blocks of memory. In the coalesced scheme, a fetched
block of memory is more likely to contain elements needed by several
threads, so more threads can sum elements simultaneously.
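As an illustrative sketch (not part of the submitted kernels), the two schemes differ only in how each thread walks the array. The tiny C program below prints the indices each "thread" would touch; gID, gSize, and k mirror the variables used in the kernels.

```c
#include <stdio.h>

#define N 16

int main(void) {
    const int gSize = 4;                      /* number of "threads" */
    const int k = (N + gSize - 1) / gSize;    /* block size per thread */
    for (int gID = 0; gID < gSize; gID++) {
        printf("thread %d coalesced:", gID);
        for (int i = gID; i < N; i += gSize)  /* stride-gSize walk: neighbors stay adjacent */
            printf(" %2d", i);
        printf("   blocked:");
        for (int j = 0; j < k && k * gID + j < N; j++)  /* contiguous private chunk */
            printf(" %2d", k * gID + j);
        printf("\n");
    }
    return 0;
}
```

Adjacent threads in the coalesced walk read adjacent addresses on every step, so one memory transaction can serve the whole group; in the blocked walk their addresses sit k elements apart.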
@@ -5,11 +5,16 @@ __kernel void sum_coalesced(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
 
+    int i, j, gID, gSize, temp, lSize, loglSize;
+
+    gID = get_global_id(0);
+    gSize = get_global_size(0);
+    lSize = get_local_size(0);
+
     // thread i (i.e., with i = get_global_id()) should add x[i],
     // x[i + get_global_size()], ... up to N-1, and store in sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    for (i = gID; i < N; i += gSize) {
+        sum = sum + x[i];
     }
 
     fast[local_id] = sum;
@@ -24,8 +29,17 @@ __kernel void sum_coalesced(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    loglSize = 1;                      // compute log2(lSize) by repeated halving
+    temp = lSize >> 1;
+    while (temp > 1){
+        temp = temp >> 1;
+        loglSize = loglSize + 1;
+    }
+    for (j = 1; j <= loglSize; j++) {  // binary-tree reduction in local memory
+        if (local_id < (lSize >> j)) {
+            fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -38,7 +52,8 @@ __kernel void sum_blocked(__global float* x,
 {
     float sum = 0;
     size_t local_id = get_local_id(0);
-    int k = ceil(float(N) / get_global_size(0));
+    int k = ceil((float)N / get_global_size(0));
+    int j, gID, temp, loglSize, lSize, minS;
 
     // thread with global_id 0 should add 0..k-1
     // thread with global_id 1 should add k..2k-1
@@ -48,8 +63,16 @@ __kernel void sum_blocked(__global float* x,
     //
     // Be careful that each thread stays in bounds, both relative to
     // size of x (i.e., N), and the range it's assigned to sum.
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    lSize = get_local_size(0);
+    gID = get_global_id(0);
+    if (k-1 < N - k*gID){   // minS = min(k, N - k*gID): elements this thread owns
+        minS = k;
+    }
+    else{
+        minS = N - k*gID;
+    }
Review comment: You can simplify the above if statement, and just put it in the following for loop condition:
+    for (j = 0; j < minS; j++) {
+        sum = sum + x[k*gID + j];
+    }
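A hedged reading of the reviewer's suggestion (the suggested condition itself is cut off in the comment): fold the bound into the loop condition and drop minS entirely.

```c
/* Assumed reconstruction of the reviewer's suggestion: stay inside both
 * this thread's k-element block and the array bound N in one condition. */
for (j = 0; j < k && k*gID + j < N; j++) {
    sum = sum + x[k*gID + j];
}
```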
 
     fast[local_id] = sum;
@@ -64,8 +87,17 @@ __kernel void sum_blocked(__global float* x,
     // You can assume get_local_size(0) is a power of 2.
     //
     // See http://www.nehalemlabs.net/prototype/blog/2014/06/16/parallel-programming-with-opencl-and-python-parallel-reduce/
-    for (;;) { // YOUR CODE HERE
-        ; // YOUR CODE HERE
+    loglSize = 1;                      // compute log2(lSize) by repeated halving
+    temp = lSize >> 1;
+    while (temp > 1){
+        temp = temp >> 1;
+        loglSize = loglSize + 1;
+    }
+    for (j = 1; j <= loglSize; j++) {  // binary-tree reduction in local memory
+        if (local_id < (lSize >> j)) {
+            fast[local_id] = fast[local_id] + fast[local_id + (lSize >> j)];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     if (local_id == 0) partial[get_group_id(0)] = fast[0];
@@ -1,5 +1,6 @@
 #include "median9.h"
 
 // 3x3 median filter
 __kernel void
 median_3x3(__global __read_only float *in_values,
@@ -12,23 +13,81 @@ median_3x3(__global __read_only float *in_values,
     // Note: It may be easier for you to implement median filtering
     // without using the local buffer, first, then adjust your code to
     // use such a buffer after you have that working.
+    int gID, lID, x, y, lx, ly, gSizeX, gSizeY,
+        lSizeX, lSizeY, xTemp, yTemp, xUse, yUse,
+        buf_corner_x, buf_corner_y, buf_x, buf_y, row;
+    // the code below is adapted from the lecture code on halos
+    x = get_global_id(0);
+    y = get_global_id(1);
+    lx = get_local_id(0);
+    ly = get_local_id(1);
+    gSizeX = get_global_size(0);
+    gSizeY = get_global_size(1);
+    lSizeX = get_local_size(0);
+    lSizeY = get_local_size(1);
+
+    gID = gSizeX*y + x;
+    lID = lSizeX*ly + lx;
+
+    buf_corner_x = x - lx - halo;
+    buf_corner_y = y - ly - halo;
+
     // Load into buffer (with 1-pixel halo).
     //
     // It may be helpful to consult HW3 Problem 5, and
     // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
     //
     // Note that globally out-of-bounds pixels should be replaced
     // with the nearest valid pixel's value.
+    buf_x = lx + halo;
+    buf_y = ly + halo;
+
+    if ((y < h) && (x < w)){
Review comment: This if statement should not be here.
+        if (lID < buf_w){ // only work with buf_w threads
+            xTemp = buf_corner_x + lID;
+            xUse = xTemp;
+            if (xTemp < 0){ // if pixel out of bounds, add compensation steps to find closest in-bounds pixel
+                xUse += 1;
+            }
+            if (xTemp > w - 1){
+                xUse -= 1;
Review comment: This would lead to accessing wrong values if the buffer extends more than 1 pixel past the right side of the input matrix.
+            }
+            for (row = 0; row < buf_h; row++) {
+                yTemp = buf_corner_y + row;
+                yUse = yTemp;
+                if (yTemp < 0){
+                    yUse += 1;
+                }
+                if (yTemp > h - 1){
+                    yUse -= 1;
Review comment: This would lead to accessing wrong values if the buffer extends more than 1 pixel past the bottom of the input matrix.
+                }
+                buffer[row * buf_w + lID] = in_values[yUse*gSizeX + xUse]; // copy the pixel, or the closest in-bounds pixel, from global memory into the buffer
+            }
+        }
+    }
 
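As both review comments point out, a single ±1 compensation step only handles a halo that overhangs the image by one pixel. A hedged sketch of the usual fix (my addition, not part of the submitted diff) clamps each coordinate into range instead:

```c
/* Sketch: clamp rather than step once, so any amount of overhang maps to
 * the nearest valid pixel. min/max are built-in functions in OpenCL C. */
xUse = min(max(xTemp, 0), w - 1);
yUse = min(max(yTemp, 0), h - 1);
buffer[row * buf_w + lID] = in_values[yUse * gSizeX + xUse];
```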
     // Compute 3x3 median for each pixel in core (non-halo) pixels
     //
     // We've given you median9.h, and included it above, so you can
     // use the median9() function.
+    barrier(CLK_LOCAL_MEM_FENCE); // wait until every thread has filled its part of the buffer
+    if ((y < h) && (x < w)){
+        out_values[gID] = median9(buffer[(buf_y-1)*buf_w + (buf_x-1)], // take median of 8 neighbors and current pixel
+                                  buffer[(buf_y-1)*buf_w + (buf_x)],
+                                  buffer[(buf_y-1)*buf_w + (buf_x+1)],
+                                  buffer[(buf_y)*buf_w + (buf_x-1)],
+                                  buffer[(buf_y)*buf_w + (buf_x)],
+                                  buffer[(buf_y)*buf_w + (buf_x+1)],
+                                  buffer[(buf_y+1)*buf_w + (buf_x-1)],
+                                  buffer[(buf_y+1)*buf_w + (buf_x)],
+                                  buffer[(buf_y+1)*buf_w + (buf_x+1)]);
+    }
 
-    // Load into buffer (with 1-pixel halo).
-    //
-    // It may be helpful to consult HW3 Problem 5, and
-    // https://github.com/harvard-cs205/OpenCL-examples/blob/master/load_halo.cl
-    //
-    // Note that globally out-of-bounds pixels should be replaced
-    // with the nearest valid pixel's value.
-
-    // Each thread in the valid region (x < w, y < h) should write
-    // back its 3x3 neighborhood median.
-
-    // Compute 3x3 median for each pixel in core (non-halo) pixels
-    //
-    // We've given you median9.h, and included it above, so you can
-    // use the median9() function.
-
-    // Each thread in the valid region (x < w, y < h) should write
-    // back its 3x3 neighborhood median.
 }
@@ -0,0 +1,86 @@
Explanation:

Part 1:

This is the base code.

Part 2:

This is optimized over the first part because the buffer values are updated with the
grandparent values, which are guaranteed to be less than or equal to the current
buffer values.
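A minimal sketch of the grandparent update described above, assuming a global labels array and a local buffer copy as in the labeling kernel (the variable names are my assumption, not taken from this diff):

```c
// Sketch: read the grandparent's label instead of the parent's.
// labels[p] holds the parent of pixel p, so labels[labels[p]] is
// its grandparent, which can only be a smaller (or equal) label.
int parent = buffer[buf_y * buf_w + buf_x];
if (parent < w * h)   // skip background/sentinel pixels
    buffer[buf_y * buf_w + buf_x] = labels[labels[parent]];
```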

Part 3:

This is optimized over the second part because a pixel's parent is updated, via atomic
min, to the pixel's value whenever that value is smaller. However, the iteration time
increases due to the atomic (min) operation.
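In OpenCL C this is a one-liner, sketched here with assumed names (old_label is the pixel's current parent, new_label the smaller candidate):

```c
// Sketch: atomically lower the parent's label; threads racing on the
// same slot serialize, which is why the per-iteration time grows.
atomic_min(&labels[old_label], new_label);
```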

Part 4:

Having a single thread update the buffer regions with grandparent values is not as efficient
on average, given that the time per iteration is roughly twice as long as in Part 3. Even
though many adjacent pixels may have equal buffer values after sufficient iterations, the
reduced number of memory reads does not outweigh the loss of parallelism between threads.
If more threads are used, for example due to smaller context sizes, then even more memory
accesses to the labels array will occur. So using one thread to remember previous grandparent
values may perform better than having every thread fetch a value from memory simultaneously
(which causes partial serialization, since more threads must wait for memory) as the number
of threads grows even larger.

Part 5:

If a standard min operation were used instead of atomic min, the iteration time would
decrease because the serialization delays imposed by the atomic operation would no longer
apply. The final result would still be correct: even if one thread overwrites the pixel's
parent with a greater value than another thread wrote, that value is still less than the
original parent value, so only the number of iterations may increase. A label can thus
increase during a single iteration, but between iterations label values cannot increase,
because a pixel's value from the previous iteration is compared via the minimum operator
with any new label. After the current iteration finishes, each label's value is therefore
less than or equal to its value after the previous iteration.
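A sketch of the non-atomic variant discussed above (same assumed names as before):

```c
// Sketch: a racy read-modify-write. Two threads may interleave and the
// larger of their two minima can win this iteration, but the stored value
// never exceeds what was already there, so later iterations still converge.
int cur = labels[old_label];
if (new_label < cur)
    labels[old_label] = new_label;   // plain store instead of atomic_min
```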
Results:

Maze 1

Part 1:
Finished after 915 iterations, 36.084992 ms total, 0.0394371497268 ms per iteration
Found 2 regions

Part 2:
Finished after 529 iterations, 20.321376 ms total, 0.0384146994329 ms per iteration
Found 2 regions

Part 3:
Finished after 12 iterations, 0.611552 ms total, 0.0509626666667 ms per iteration
Found 2 regions

Part 4:
Finished after 11 iterations, 1.224416 ms total, 0.111310545455 ms per iteration
Found 2 regions

Maze 2

Part 1:
Finished after 532 iterations, 20.138752 ms total, 0.0378547969925 ms per iteration
Found 35 regions

Part 2:
Finished after 276 iterations, 10.62384 ms total, 0.038492173913 ms per iteration
Found 35 regions

Part 3:
Finished after 11 iterations, 0.539008 ms total, 0.0490007272727 ms per iteration
Found 35 regions

Part 4:
Finished after 10 iterations, 1.11216 ms total, 0.111216 ms per iteration
Found 35 regions
Review comment (on the loglSize while loop in the reduction): Instead of using that while
loop, you can fold it into the following for loop by setting the initial j to lSize >> 1
(i.e., lSize/2) and halving it on each iteration.
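A hedged sketch of the suggested rewrite (my reconstruction of the reviewer's idea, not code from this PR):

```c
// Sketch: no explicit log2 computation; j is the offset to the partner
// element and is halved each pass until the sum collapses into fast[0].
for (j = lSize >> 1; j > 0; j >>= 1) {
    if (local_id < j) {
        fast[local_id] = fast[local_id] + fast[local_id + j];
    }
    barrier(CLK_LOCAL_MEM_FENCE);
}
```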