Skip to content

Commit

Permalink
Merge pull request #94 from computablee/features
Browse files (browse the repository at this point in the history)
Worksharing-For Optimizations
  • Loading branch information
computablee authored Oct 17, 2023
2 parents d08034d + 6509181 commit 7edd5c8
Show file tree
Hide file tree
Showing 12 changed files with 59 additions and 127 deletions.
15 changes: 13 additions & 2 deletions DotMP-Tests/ParallelTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ public void Dynamic_should_produce_correct_results()
y[i] = 1.0f;
}

float[] z = saxpy_parallelregion_for(2.0f, x, y, Schedule.Dynamic, 16);
float[] z = saxpy_parallelregion_for(2.0f, x, y, Schedule.Dynamic, 1);

for (int i = 0; i < z.Length; i++)
{
Expand Down Expand Up @@ -394,6 +394,13 @@ public void Schedule_runtime_works()
DotMP.Parallel.GetChunkSize().Should().Be(256);
});

Environment.SetEnvironmentVariable("OMP_SCHEDULE", "static");
DotMP.Parallel.ParallelFor(0, 1025, num_threads: 4, schedule: DotMP.Schedule.Runtime, action: i =>
{
DotMP.Parallel.GetSchedule().Should().Be(DotMP.Schedule.Static);
DotMP.Parallel.GetChunkSize().Should().Be(257);
});

Environment.SetEnvironmentVariable("OMP_SCHEDULE", "guided,2");
DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Runtime, action: i =>
{
Expand Down Expand Up @@ -649,11 +656,15 @@ public void Ordered_works()
{
uint threads = 8;
int[] incrementing = new int[1024];
int ctr = 0;

DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Static,
num_threads: threads, action: i =>
{
DotMP.Parallel.Ordered(0, () => incrementing[i] = i);
DotMP.Parallel.Ordered(0, () =>
{
incrementing[i] = ctr++;
});
});

for (int i = 0; i < incrementing.Length; i++)
Expand Down
2 changes: 2 additions & 0 deletions DotMP/DotMP.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
<PackageLicenseExpression>MIT</PackageLicenseExpression>
<PackageReleaseNotes>Added support for .NET 7.0.</PackageReleaseNotes>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<DebugType>pdbonly</DebugType>
<DebugSymbols>true</DebugSymbols>
</PropertyGroup>

<ItemGroup>
Expand Down
7 changes: 0 additions & 7 deletions DotMP/ForkedRegion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,6 @@ internal class Region
/// The function to be executed.
/// </summary>
internal Action omp_fn;
/// <summary>
/// Generic SpinWait objects for each thread.
/// </summary>
internal SpinWait[] spin;

/// <summary>
/// Creates a specified number of threads available to the parallel region, and sets the function to be executed.
Expand All @@ -48,9 +44,6 @@ internal Region(uint num_threads, Action omp_fn)
ws_lock = new object();
this.num_threads = num_threads;
this.omp_fn = omp_fn;
this.spin = new SpinWait[num_threads];
for (int i = 0; i < num_threads; i++)
this.spin[i] = new SpinWait();
}
}

Expand Down
9 changes: 3 additions & 6 deletions DotMP/Init.cs
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,6 @@ internal int start
{
return start_pv;
}
private set
{
start_pv = value;
}
}
/// <summary>
/// A generic lock to be used within the parallel for loop.
Expand Down Expand Up @@ -210,9 +206,10 @@ internal WorkShare() { }
/// Advance the start by some value.
/// </summary>
/// <param name="advance_by">The value to advance start by.</param>
internal void Advance(int advance_by)
/// <returns>The start of the current chunk to execute.</returns>
internal int Advance(int advance_by)
{
start += advance_by;
return Interlocked.Add(ref start_pv, advance_by) - advance_by;
}

/// <summary>
Expand Down
18 changes: 8 additions & 10 deletions DotMP/Iter.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Runtime.CompilerServices;
using System.Threading;

namespace DotMP
Expand Down Expand Up @@ -101,6 +102,7 @@ internal static void StaticLoop<T>(WorkShare ws, int thread_id, ForAction<T> for
/// <param name="chunk_size">The chunk size.</param>
/// <param name="forAction">The function to be executed.</param>
/// <param name="local">The local variable for reductions.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void StaticNext<T>(WorkShare ws, Thr thr, uint chunk_size, ForAction<T> forAction, ref T local)
{
int start = thr.curr_iter;
Expand All @@ -124,7 +126,8 @@ internal static void LoadBalancingLoop<T>(WorkShare ws, ForAction<T> forAction,
int end = ws.end;

T local = default;
ws.SetLocal(ref local);
if (forAction.IsReduction)
ws.SetLocal(ref local);

if (schedule == Schedule.Guided) while (ws.start < end)
{
Expand All @@ -147,16 +150,10 @@ internal static void LoadBalancingLoop<T>(WorkShare ws, ForAction<T> forAction,
/// <param name="thr">The Thr object for the current thread.</param>
/// <param name="forAction">The function to be executed.</param>
/// <param name="local">The local variable for reductions.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void DynamicNext<T>(WorkShare ws, Thr thr, ForAction<T> forAction, ref T local)
{
int chunk_start;

lock (ws.ws_lock)
{
chunk_start = ws.start;
ws.Advance((int)ws.chunk_size);
}

int chunk_start = ws.Advance((int)ws.chunk_size);
int chunk_end = (int)Math.Min(chunk_start + ws.chunk_size, ws.end);

forAction.PerformLoop(ref thr.working_iter, chunk_start, chunk_end, ref local);
Expand All @@ -171,14 +168,15 @@ private static void DynamicNext<T>(WorkShare ws, Thr thr, ForAction<T> forAction
/// <param name="thr">The Thr object for the current thread.</param>
/// <param name="forAction">The function to be executed.</param>
/// <param name="local">The local variable for reductions.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void GuidedNext<T>(WorkShare ws, Thr thr, ForAction<T> forAction, ref T local)
{
int chunk_start, chunk_size;

lock (ws.ws_lock)
{
chunk_start = ws.start;
chunk_size = (int)Math.Max(ws.chunk_size, (ws.end - chunk_start) / ws.num_threads);
chunk_size = (int)Math.Max(ws.chunk_size, (ws.end - chunk_start) / (ws.num_threads * 2));

ws.Advance(chunk_size);
}
Expand Down
7 changes: 3 additions & 4 deletions DotMP/Parallel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ private static void FixArgs(int start, int end, ref Schedule sched, ref uint? ch
{
case Schedule.Static:
chunk_size = (uint)((end - start) / num_threads);
if ((end - start) % num_threads > 0)
chunk_size++;
break;
case Schedule.Dynamic:
chunk_size = (uint)((end - start) / num_threads) / 32;
Expand Down Expand Up @@ -1107,10 +1109,7 @@ public static void Ordered(int id, Action action)

WorkShare ws = new WorkShare();

while (ordered[id] != ws.thread.working_iter)
{
freg.reg.spin[tid].SpinOnce();
}
while (ordered[id] != ws.thread.working_iter) ;

action();

Expand Down
32 changes: 18 additions & 14 deletions DotMP/Wrappers.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System;
using System.Linq;
using System.Runtime.CompilerServices;

namespace DotMP
{
Expand Down Expand Up @@ -334,15 +335,14 @@ internal ForAction(ActionRefN<T> action, (int, int)[] ranges)
/// <param name="diff2">The difference in the second pair of indices.</param>
/// <param name="start1">The start of the first pair of indices.</param>
/// <param name="start2">The start of the second pair of indices.</param>
/// <returns>The two indices.</returns>
private ValueTuple<int, int> ComputeIndices2(int curr_iter, int diff2, int start1, int start2)
/// <param name="i">The first computed index.</param>
/// <param name="j">The second computed index.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ComputeIndices2(int curr_iter, int diff2, int start1, int start2, out int i, out int j)
{
int i, j;
i = Math.DivRem(curr_iter, diff2, out j);
i += start1;
j += start2;

return (i, j);
}

/// <summary>
Expand All @@ -354,17 +354,17 @@ private ValueTuple<int, int> ComputeIndices2(int curr_iter, int diff2, int start
/// <param name="start1">The start of the first pair of indices.</param>
/// <param name="start2">The start of the second pair of indices.</param>
/// <param name="start3">The start of the third pair of indices.</param>
/// <returns>The three indices.</returns>
private ValueTuple<int, int, int> ComputeIndices3(int curr_iter, int diff2, int diff3, int start1, int start2, int start3)
/// <param name="i">The first computed index.</param>
/// <param name="j">The second computed index.</param>
/// <param name="k">The third computed index.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void ComputeIndices3(int curr_iter, int diff2, int diff3, int start1, int start2, int start3, out int i, out int j, out int k)
{
int i, j, k;
i = Math.DivRem(curr_iter, diff2 * diff3, out j);
j = Math.DivRem(j, diff3, out k);
i += start1;
j += start2;
k += start3;

return (i, j, k);
}

/// <summary>
Expand Down Expand Up @@ -426,7 +426,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)

for (curr_iter = start; curr_iter < end; curr_iter++)
{
(int i, int j) = ComputeIndices2(curr_iter, diff2, start1, start2);
int i, j;
ComputeIndices2(curr_iter, diff2, start1, start2, out i, out j);
omp_col_2(i, j);
}
break;
Expand All @@ -440,7 +441,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)

for (curr_iter = start; curr_iter < end; curr_iter++)
{
(int i, int j, int k) = ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3);
int i, j, k;
ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3, out i, out j, out k);
omp_col_3(i, j, k);
}
break;
Expand All @@ -466,7 +468,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)

for (curr_iter = start; curr_iter < end; curr_iter++)
{
(int i, int j) = ComputeIndices2(curr_iter, diff2, start1, start2);
int i, j;
ComputeIndices2(curr_iter, diff2, start1, start2, out i, out j);
omp_red_col_2(ref local, i, j);
}
break;
Expand All @@ -480,7 +483,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)

for (curr_iter = start; curr_iter < end; curr_iter++)
{
(int i, int j, int k) = ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3);
int i, j, k;
ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3, out i, out j, out k);
omp_red_col_3(ref local, i, j, k);
}
break;
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/HeatTransfer/HeatTransfer.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<DebugType>pdbonly</DebugType>
<DebugSymbols>true</DebugSymbols>
</PropertyGroup>

</Project>
19 changes: 10 additions & 9 deletions benchmarks/HeatTransfer/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ public void DoStep()
[SimpleJob(RuntimeMoniker.Net70)]
[ThreadingDiagnoser]
[HardwareCounters]
[EventPipeProfiler(EventPipeProfile.CpuSampling)]
// test heat transfer using Parallel.For
public class HeatTransferFor
{
Expand All @@ -96,11 +97,11 @@ public class HeatTransferFor
private double[,] grid = new double[0, 0];

// test dims of 100x100, 1000x1000, and 5000x5000
[Params(100, 1000, 5000)]
[Params(1000)]
public int dim;

// test with 10 steps and 100 steps
[Params(10, 100)]
[Params(100)]
public int steps;

// change this to configure the number of threads to use
Expand Down Expand Up @@ -136,7 +137,7 @@ public void DoSimulation()
public void DoStep()
{
//iterate over all cells not on the border
DotMP.Parallel.For(1, dim - 1, action: i =>
DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: i =>
{
for (int j = 1; j < dim - 1; j++)
{
Expand All @@ -146,7 +147,7 @@ public void DoStep()
});

//copy the scratch array to the grid array
DotMP.Parallel.For(1, dim - 1, action: i =>
DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: i =>
{
for (int j = 1; j < dim - 1; j++)
{
Expand All @@ -157,9 +158,9 @@ public void DoStep()
}

[SimpleJob(RuntimeMoniker.Net60)]
[SimpleJob(RuntimeMoniker.Net70)]
[ThreadingDiagnoser]
[HardwareCounters]
[EventPipeProfiler(EventPipeProfile.CpuSampling)]
// test heat transfer using Parallel.ForCollapse
public class HeatTransferForCollapse
{
Expand All @@ -169,11 +170,11 @@ public class HeatTransferForCollapse
private double[,] grid = new double[0, 0];

// test dims of 100x100, 1000x1000, and 5000x5000
[Params(100, 1000, 5000)]
[Params(500)]
public int dim;

// test with 10 steps and 100 steps
[Params(10, 100)]
[Params(100)]
public int steps;

// change this to configure the number of threads to use
Expand Down Expand Up @@ -209,14 +210,14 @@ public void DoSimulation()
public void DoStep()
{
//iterate over all cells not on the border
DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), action: (i, j) =>
DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: (i, j) =>
{
//set the scratch array to the average of the surrounding cells
scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
});

//copy the scratch array to the grid array
DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), action: (i, j) =>
DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: (i, j) =>
{
grid[i, j] = scratch[i, j];
});
Expand Down
25 changes: 0 additions & 25 deletions examples/CSParallel/KNN/KNN.sln

This file was deleted.

Loading

0 comments on commit 7edd5c8

Please sign in to comment.