Merge pull request #94 from computablee/features

Worksharing-For Optimizations
computablee · Oct 17, 2023 · 7edd5c8
2 parents d08034d + 6509181
commit 7edd5c8
Show file tree

Hide file tree

Showing 12 changed files with 59 additions and 127 deletions.
diff --git a/DotMP-Tests/ParallelTests.cs b/DotMP-Tests/ParallelTests.cs
@@ -139,7 +139,7 @@ public void Dynamic_should_produce_correct_results()
                 y[i] = 1.0f;
             }
 
-            float[] z = saxpy_parallelregion_for(2.0f, x, y, Schedule.Dynamic, 16);
+            float[] z = saxpy_parallelregion_for(2.0f, x, y, Schedule.Dynamic, 1);
 
             for (int i = 0; i < z.Length; i++)
             {
@@ -394,6 +394,13 @@ public void Schedule_runtime_works()
                 DotMP.Parallel.GetChunkSize().Should().Be(256);
             });
 
+            Environment.SetEnvironmentVariable("OMP_SCHEDULE", "static");
+            DotMP.Parallel.ParallelFor(0, 1025, num_threads: 4, schedule: DotMP.Schedule.Runtime, action: i =>
+            {
+                DotMP.Parallel.GetSchedule().Should().Be(DotMP.Schedule.Static);
+                DotMP.Parallel.GetChunkSize().Should().Be(257);
+            });
+
             Environment.SetEnvironmentVariable("OMP_SCHEDULE", "guided,2");
             DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Runtime, action: i =>
             {
@@ -649,11 +656,15 @@ public void Ordered_works()
         {
             uint threads = 8;
             int[] incrementing = new int[1024];
+            int ctr = 0;
 
             DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Static,
                                         num_threads: threads, action: i =>
             {
-                DotMP.Parallel.Ordered(0, () => incrementing[i] = i);
+                DotMP.Parallel.Ordered(0, () =>
+                {
+                    incrementing[i] = ctr++;
+                });
             });
 
             for (int i = 0; i < incrementing.Length; i++)

diff --git a/DotMP/DotMP.csproj b/DotMP/DotMP.csproj
@@ -15,6 +15,8 @@
     <PackageLicenseExpression>MIT</PackageLicenseExpression>
     <PackageReleaseNotes>Added support for .NET 7.0.</PackageReleaseNotes>
     <GenerateDocumentationFile>true</GenerateDocumentationFile>
+    <DebugType>pdbonly</DebugType>
+    <DebugSymbols>true</DebugSymbols>
   </PropertyGroup>
 
   <ItemGroup>

diff --git a/DotMP/ForkedRegion.cs b/DotMP/ForkedRegion.cs
@@ -25,10 +25,6 @@ internal class Region
         /// The function to be executed.
         /// </summary>
         internal Action omp_fn;
-        /// <summary>
-        /// Generic SpinWait objects for each thread.
-        /// </summary>
-        internal SpinWait[] spin;
 
         /// <summary>
         /// Creates a specified number of threads available to the parallel region, and sets the function to be executed.
@@ -48,9 +44,6 @@ internal Region(uint num_threads, Action omp_fn)
             ws_lock = new object();
             this.num_threads = num_threads;
             this.omp_fn = omp_fn;
-            this.spin = new SpinWait[num_threads];
-            for (int i = 0; i < num_threads; i++)
-                this.spin[i] = new SpinWait();
         }
     }
 

diff --git a/DotMP/Init.cs b/DotMP/Init.cs
@@ -68,10 +68,6 @@ internal int start
             {
                 return start_pv;
             }
-            private set
-            {
-                start_pv = value;
-            }
         }
         /// <summary>
         /// A generic lock to be used within the parallel for loop.
@@ -210,9 +206,10 @@ internal WorkShare() { }
         /// Advance the start by some value.
         /// </summary>
         /// <param name="advance_by">The value to advance start by.</param>
-        internal void Advance(int advance_by)
+        /// <returns>The start of the current chunk to execute.</returns>
+        internal int Advance(int advance_by)
         {
-            start += advance_by;
+            return Interlocked.Add(ref start_pv, advance_by) - advance_by;
         }
 
         /// <summary>

diff --git a/DotMP/Iter.cs b/DotMP/Iter.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Runtime.CompilerServices;
 using System.Threading;
 
 namespace DotMP
@@ -101,6 +102,7 @@ internal static void StaticLoop<T>(WorkShare ws, int thread_id, ForAction<T> for
         /// <param name="chunk_size">The chunk size.</param>
         /// <param name="forAction">The function to be executed.</param>
         /// <param name="local">The local variable for reductions.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static void StaticNext<T>(WorkShare ws, Thr thr, uint chunk_size, ForAction<T> forAction, ref T local)
         {
             int start = thr.curr_iter;
@@ -124,7 +126,8 @@ internal static void LoadBalancingLoop<T>(WorkShare ws, ForAction<T> forAction,
             int end = ws.end;
 
             T local = default;
-            ws.SetLocal(ref local);
+            if (forAction.IsReduction)
+                ws.SetLocal(ref local);
 
             if (schedule == Schedule.Guided) while (ws.start < end)
                 {
@@ -147,16 +150,10 @@ internal static void LoadBalancingLoop<T>(WorkShare ws, ForAction<T> forAction,
         /// <param name="thr">The Thr object for the current thread.</param>
         /// <param name="forAction">The function to be executed.</param>
         /// <param name="local">The local variable for reductions.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static void DynamicNext<T>(WorkShare ws, Thr thr, ForAction<T> forAction, ref T local)
         {
-            int chunk_start;
-
-            lock (ws.ws_lock)
-            {
-                chunk_start = ws.start;
-                ws.Advance((int)ws.chunk_size);
-            }
-
+            int chunk_start = ws.Advance((int)ws.chunk_size);
             int chunk_end = (int)Math.Min(chunk_start + ws.chunk_size, ws.end);
 
             forAction.PerformLoop(ref thr.working_iter, chunk_start, chunk_end, ref local);
@@ -171,14 +168,15 @@ private static void DynamicNext<T>(WorkShare ws, Thr thr, ForAction<T> forAction
         /// <param name="thr">The Thr object for the current thread.</param>
         /// <param name="forAction">The function to be executed.</param>
         /// <param name="local">The local variable for reductions.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         private static void GuidedNext<T>(WorkShare ws, Thr thr, ForAction<T> forAction, ref T local)
         {
             int chunk_start, chunk_size;
 
             lock (ws.ws_lock)
             {
                 chunk_start = ws.start;
-                chunk_size = (int)Math.Max(ws.chunk_size, (ws.end - chunk_start) / ws.num_threads);
+                chunk_size = (int)Math.Max(ws.chunk_size, (ws.end - chunk_start) / (ws.num_threads * 2));
 
                 ws.Advance(chunk_size);
             }

diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs
@@ -94,6 +94,8 @@ private static void FixArgs(int start, int end, ref Schedule sched, ref uint? ch
                 {
                     case Schedule.Static:
                         chunk_size = (uint)((end - start) / num_threads);
+                        if ((end - start) % num_threads > 0)
+                            chunk_size++;
                         break;
                     case Schedule.Dynamic:
                         chunk_size = (uint)((end - start) / num_threads) / 32;
@@ -1107,10 +1109,7 @@ public static void Ordered(int id, Action action)
 
             WorkShare ws = new WorkShare();
 
-            while (ordered[id] != ws.thread.working_iter)
-            {
-                freg.reg.spin[tid].SpinOnce();
-            }
+            while (ordered[id] != ws.thread.working_iter) ;
 
             action();
 

diff --git a/DotMP/Wrappers.cs b/DotMP/Wrappers.cs
@@ -1,5 +1,6 @@
 using System;
 using System.Linq;
+using System.Runtime.CompilerServices;
 
 namespace DotMP
 {
@@ -334,15 +335,14 @@ internal ForAction(ActionRefN<T> action, (int, int)[] ranges)
         /// <param name="diff2">The difference in the second pair of indices.</param>
         /// <param name="start1">The start of the first pair of indices.</param>
         /// <param name="start2">The start of the second pair of indices.</param>
-        /// <returns>The two indices.</returns>
-        private ValueTuple<int, int> ComputeIndices2(int curr_iter, int diff2, int start1, int start2)
+        /// <param name="i">The first computed index.</param>
+        /// <param name="j">The second computed index.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ComputeIndices2(int curr_iter, int diff2, int start1, int start2, out int i, out int j)
         {
-            int i, j;
             i = Math.DivRem(curr_iter, diff2, out j);
             i += start1;
             j += start2;
-
-            return (i, j);
         }
 
         /// <summary>
@@ -354,17 +354,17 @@ private ValueTuple<int, int> ComputeIndices2(int curr_iter, int diff2, int start
         /// <param name="start1">The start of the first pair of indices.</param>
         /// <param name="start2">The start of the second pair of indices.</param>
         /// <param name="start3">The start of the third pair of indices.</param>
-        /// <returns>The three indices.</returns>
-        private ValueTuple<int, int, int> ComputeIndices3(int curr_iter, int diff2, int diff3, int start1, int start2, int start3)
+        /// <param name="i">The first computed index.</param>
+        /// <param name="j">The second computed index.</param>
+        /// <param name="k">The third computed index.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void ComputeIndices3(int curr_iter, int diff2, int diff3, int start1, int start2, int start3, out int i, out int j, out int k)
         {
-            int i, j, k;
             i = Math.DivRem(curr_iter, diff2 * diff3, out j);
             j = Math.DivRem(j, diff3, out k);
             i += start1;
             j += start2;
             k += start3;
-
-            return (i, j, k);
         }
 
         /// <summary>
@@ -426,7 +426,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)
 
                     for (curr_iter = start; curr_iter < end; curr_iter++)
                     {
-                        (int i, int j) = ComputeIndices2(curr_iter, diff2, start1, start2);
+                        int i, j;
+                        ComputeIndices2(curr_iter, diff2, start1, start2, out i, out j);
                         omp_col_2(i, j);
                     }
                     break;
@@ -440,7 +441,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)
 
                     for (curr_iter = start; curr_iter < end; curr_iter++)
                     {
-                        (int i, int j, int k) = ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3);
+                        int i, j, k;
+                        ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3, out i, out j, out k);
                         omp_col_3(i, j, k);
                     }
                     break;
@@ -466,7 +468,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)
 
                     for (curr_iter = start; curr_iter < end; curr_iter++)
                     {
-                        (int i, int j) = ComputeIndices2(curr_iter, diff2, start1, start2);
+                        int i, j;
+                        ComputeIndices2(curr_iter, diff2, start1, start2, out i, out j);
                         omp_red_col_2(ref local, i, j);
                     }
                     break;
@@ -480,7 +483,8 @@ internal void PerformLoop(ref int curr_iter, int start, int end, ref T local)
 
                     for (curr_iter = start; curr_iter < end; curr_iter++)
                     {
-                        (int i, int j, int k) = ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3);
+                        int i, j, k;
+                        ComputeIndices3(curr_iter, diff2, diff3, start1, start2, start3, out i, out j, out k);
                         omp_red_col_3(ref local, i, j, k);
                     }
                     break;

diff --git a/benchmarks/HeatTransfer/HeatTransfer.csproj b/benchmarks/HeatTransfer/HeatTransfer.csproj
@@ -14,6 +14,8 @@
     <TargetFramework>net6.0</TargetFramework>
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
+    <DebugType>pdbonly</DebugType>
+    <DebugSymbols>true</DebugSymbols>
   </PropertyGroup>
 
 </Project>
diff --git a/benchmarks/HeatTransfer/Program.cs b/benchmarks/HeatTransfer/Program.cs
@@ -87,6 +87,7 @@ public void DoStep()
 [SimpleJob(RuntimeMoniker.Net70)]
 [ThreadingDiagnoser]
 [HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
 // test heat transfer using Parallel.For
 public class HeatTransferFor
 {
@@ -96,11 +97,11 @@ public class HeatTransferFor
     private double[,] grid = new double[0, 0];
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    [Params(100, 1000, 5000)]
+    [Params(1000)]
     public int dim;
 
     // test with 10 steps and 100 steps
-    [Params(10, 100)]
+    [Params(100)]
     public int steps;
 
     // change this to configure the number of threads to use
@@ -136,7 +137,7 @@ public void DoSimulation()
     public void DoStep()
     {
         //iterate over all cells not on the border
-        DotMP.Parallel.For(1, dim - 1, action: i =>
+        DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: i =>
         {
             for (int j = 1; j < dim - 1; j++)
             {
@@ -146,7 +147,7 @@ public void DoStep()
         });
 
         //copy the scratch array to the grid array
-        DotMP.Parallel.For(1, dim - 1, action: i =>
+        DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: i =>
         {
             for (int j = 1; j < dim - 1; j++)
             {
@@ -157,9 +158,9 @@ public void DoStep()
 }
 
 [SimpleJob(RuntimeMoniker.Net60)]
-[SimpleJob(RuntimeMoniker.Net70)]
 [ThreadingDiagnoser]
 [HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
 // test heat transfer using Parallel.ForCollapse
 public class HeatTransferForCollapse
 {
@@ -169,11 +170,11 @@ public class HeatTransferForCollapse
     private double[,] grid = new double[0, 0];
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    [Params(100, 1000, 5000)]
+    [Params(500)]
     public int dim;
 
     // test with 10 steps and 100 steps
-    [Params(10, 100)]
+    [Params(100)]
     public int steps;
 
     // change this to configure the number of threads to use
@@ -209,14 +210,14 @@ public void DoSimulation()
     public void DoStep()
     {
         //iterate over all cells not on the border
-        DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), action: (i, j) =>
+        DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: (i, j) =>
         {
             //set the scratch array to the average of the surrounding cells
             scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
         });
 
         //copy the scratch array to the grid array
-        DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), action: (i, j) =>
+        DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Dynamic, chunk_size: 1, action: (i, j) =>
         {
             grid[i, j] = scratch[i, j];
         });

diff --git a/examples/CSParallel/KNN/KNN.sln b/examples/CSParallel/KNN/KNN.sln