diff --git a/.github/workflows/publish_site.yml b/.github/workflows/publish_site.yml index 21b8cc44..2bc3b185 100644 --- a/.github/workflows/publish_site.yml +++ b/.github/workflows/publish_site.yml @@ -12,6 +12,14 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Setup .NET + uses: actions/setup-dotnet@v3 + with: + dotnet-version: 8.0.x + + - name: Build Templates + run: make build + - name: Install Doxygen run: sudo apt-get install doxygen graphviz -y shell: bash @@ -28,4 +36,4 @@ jobs: uses: JamesIves/github-pages-deploy-action@v4 with: token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - folder: docs/html \ No newline at end of file + folder: docs/html diff --git a/.gitignore b/.gitignore index 25fb8c99..ac1e66dc 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ docs/* .vscode *.opencover.xml *.sln +AcceleratorHandler.cs +Gpu.cs ProcessedREADME.md # User-specific files diff --git a/DotMP-Tests/ParallelTests.cs b/DotMP-Tests/CPUTests.cs similarity index 97% rename from DotMP-Tests/ParallelTests.cs rename to DotMP-Tests/CPUTests.cs index 4f38d367..038ff4ba 100644 --- a/DotMP-Tests/ParallelTests.cs +++ b/DotMP-Tests/CPUTests.cs @@ -28,9 +28,9 @@ namespace DotMPTests { /// - /// Tests for the DotMP library. + /// CPU tests for the DotMP library. /// - public class ParallelTests + public class CPUTests { private readonly ITestOutputHelper output; @@ -38,7 +38,7 @@ public class ParallelTests /// Constructor to write output. /// /// Output object. - public ParallelTests(ITestOutputHelper output) + public CPUTests(ITestOutputHelper output) { this.output = output; } @@ -522,7 +522,7 @@ public void Critical_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { for (int i = 0; i < iters; i++) - DotMP.Parallel.Critical(0, () => ++total); + DotMP.Parallel.Critical(() => ++total); }); total.Should().Be((int)threads * iters); @@ -531,14 +531,13 @@ public void Critical_works() DotMP.Parallel.ParallelRegion(num_threads: 4, action: () => { - if (DotMP.Parallel.GetThreadNum() == 0) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 1) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 2) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 3) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000)); + if (DotMP.Parallel.GetThreadNum() % 2 == 0) DotMP.Parallel.Critical(() => Thread.Sleep(1000)); + if (DotMP.Parallel.GetThreadNum() % 2 == 1) DotMP.Parallel.Critical(() => Thread.Sleep(1000)); }); double elapsed = DotMP.Parallel.GetWTime() - start; - elapsed.Should().BeLessThan(2200); + elapsed.Should().BeLessThan(2.2); + elapsed.Should().BeGreaterThan(2.0); } /// @@ -571,7 +570,7 @@ public void Single_works() { for (int i = 0; i < 10; i++) { - DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total)); + DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total)); } }); @@ -583,7 +582,7 @@ public void Single_works() { for (int i = 0; i < 10; i++) { - DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total)); + DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total)); } }); @@ -749,7 +748,7 @@ public void Ordered_works() DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Static, num_threads: threads, action: i => { - DotMP.Parallel.Ordered(0, () => + DotMP.Parallel.Ordered(() => { incrementing[i] = ctr++; }); @@ -1111,7 +1110,7 @@ public void Tasking_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, 
() => + DotMP.Parallel.Single(() => { for (int i = 0; i < threads * 2; i++) { @@ -1139,7 +1138,7 @@ public void Tasking_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { for (int i = 0; i < tasks_to_spawn; i++) { @@ -1199,7 +1198,7 @@ public void Nested_tasks_work() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { DotMP.Parallel.Task(() => { @@ -1369,7 +1368,7 @@ public void Non_parallel_single_should_except() { Assert.Throws(() => { - DotMP.Parallel.Single(0, () => { }); + DotMP.Parallel.Single(() => { }); }); } @@ -1381,7 +1380,7 @@ public void Non_parallel_critical_should_except() { Assert.Throws(() => { - DotMP.Parallel.Critical(0, () => { }); + DotMP.Parallel.Critical(() => { }); }); } @@ -1395,7 +1394,7 @@ public void Nested_worksharing_should_except() { DotMP.Parallel.ParallelFor(0, 10, num_threads: 4, action: i => { - DotMP.Parallel.Single(0, () => { }); + DotMP.Parallel.Single(() => { }); }); }); @@ -1403,7 +1402,7 @@ public void Nested_worksharing_should_except() { DotMP.Parallel.ParallelRegion(num_threads: 4, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { DotMP.Parallel.For(0, 10, action: i => { }); }); @@ -1427,7 +1426,7 @@ public void Non_for_ordered_should_except() { Assert.Throws(() => { - DotMP.Parallel.Ordered(0, () => { }); + DotMP.Parallel.Ordered(() => { }); }); } diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs new file mode 100644 index 00000000..7da2b446 --- /dev/null +++ b/DotMP-Tests/GPUTests.cs @@ -0,0 +1,136 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Text.Json.Serialization; +using System.Threading; +using DotMP; +using DotMP.GPU; +using FluentAssertions; +using Xunit; +using Xunit.Abstractions; + + +namespace DotMPTests +{ + /// + /// CPU tests for the DotMP library. + /// + public class GPUTests + { + /// + /// Tests to make sure that for loops work in GPU kernels. + /// + [Fact] + public void GPU_for_works() + { + double[] a = new double[50000]; + double[] x = new double[50000]; + double[] y = new double[50000]; + float[] res = new float[50000]; + float[] res_cpu = new float[50000]; + + random_init(a); + random_init(x); + random_init(y); + + { + using var a_gpu = new DotMP.GPU.Buffer(a, DotMP.GPU.Buffer.Behavior.To); + using var x_gpu = new DotMP.GPU.Buffer(x, DotMP.GPU.Buffer.Behavior.To); + using var y_gpu = new DotMP.GPU.Buffer(y, DotMP.GPU.Buffer.Behavior.To); + using var res_gpu = new DotMP.GPU.Buffer(res, DotMP.GPU.Buffer.Behavior.From); + + DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, x_gpu, y_gpu, res_gpu, + (i, a, x, y, res) => + { + res[i] = (float)(a[i] * x[i] + y[i]); + }); + } + + for (int i = 0; i < a.Length; i++) + { + res_cpu[i] = (float)(a[i] * x[i] + y[i]); + } + + Assert.Equal(res_cpu, res); + + double[] a_old = a.Select(a => a).ToArray(); + + using (var a_gpu = new DotMP.GPU.Buffer(a, DotMP.GPU.Buffer.Behavior.ToFrom)) + { + DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, (i, a) => + { + a[i]++; + }); + } + + for (int i = 0; i < a.Length; i++) + { + a_old[i]++; + } + + Assert.Equal(a, a_old); + } + + /// + /// Tests to make sure that DotMP.GPU.Parallel.ForCollapse produces correct results. 
+ /// + [Fact] + public void Collapse_works() + { + int[,] iters_hit = new int[1024, 1024]; + + using (var buf = new Buffer(iters_hit, DotMP.GPU.Buffer.Behavior.ToFrom)) + { + DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, (i, j, iters_hit) => + { + iters_hit[i, j]++; + }); + } + + for (int i = 0; i < 1024; i++) + for (int j = 0; j < 1024; j++) + if (i >= 258 && i < 512 && j >= 512 && j < 600) + iters_hit[i, j].Should().Be(1); + else + iters_hit[i, j].Should().Be(0); + + iters_hit = null; + + int[,,] iters_hit_3 = new int[128, 128, 64]; + + using (var buf = new Buffer(iters_hit_3, DotMP.GPU.Buffer.Behavior.ToFrom)) + { + DotMP.GPU.Parallel.ParallelForCollapse((35, 64), (16, 100), (10, 62), buf, action: (i, j, k, iters_hit_3) => + { + iters_hit_3[i, j, k]++; + }); + } + + for (int i = 0; i < 128; i++) + for (int j = 0; j < 128; j++) + for (int k = 0; k < 64; k++) + if (i >= 35 && i < 64 && j >= 16 && j < 100 && k >= 10 && k < 62) + iters_hit_3[i, j, k].Should().Be(1); + else + iters_hit_3[i, j, k].Should().Be(0); + + iters_hit_3 = null; + } + + /// + /// Randomly initialize an array of type T. + /// + /// The type to initialize to. + /// The allocated array to store values into. + private void random_init(T[] arr) + { + Random r = new Random(); + + for (int i = 0; i < arr.Length; i++) + { + arr[i] = (T)Convert.ChangeType(r.NextDouble() * 128, typeof(T)); + } + } + } +} diff --git a/DotMP/DotMP.csproj b/DotMP/DotMP.csproj index e8cc69f0..d045a50c 100644 --- a/DotMP/DotMP.csproj +++ b/DotMP/DotMP.csproj @@ -4,7 +4,7 @@ net6.0;net7.0;net8.0 DotMP DotMP - 1.6.0 + 2.0-pre1 Phillip Allen Lane,et al. A library for fork-join parallelism in .NET, with an OpenMP-like API. https://github.com/computablee/DotMP @@ -23,4 +23,21 @@ + + + + + + True + True + GPU/AcceleratorHandler.tt + + + + True + True + GPU/Gpu.tt + + + diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt new file mode 100644 index 00000000..2527d1ba --- /dev/null +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -0,0 +1,488 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +<#@ template debug="false" hostspecific="false" language="C#" #> +<#@ output extension=".cs" #> +<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' }; + int max = 13; #> + +using System; +using System.Collections.Generic; +using System.Linq; +using ILGPU; +using ILGPU.Runtime; + +namespace DotMP.GPU +{ + /// + /// The handler class managing GPU acceleration. + /// + internal sealed class AcceleratorHandler + { + /// + /// Determines if a GPU context has been initialized yet. + /// + private static bool initialized = false; + /// + /// The GPU context. 
+ /// + private static Context context; + /// + /// The accelerator object. + /// + internal static Accelerator accelerator; + /// + /// Block size to use for kernels. + /// + private static int block_size; + /// + /// Kernel cache. + /// + private static Dictionary kernels = new Dictionary(); + /// + /// Index cache for 1D kernels. + /// + private static Dictionary>> indices1d = new Dictionary>>(); + /// + /// Index cache for 2D kernels. + /// + private static Dictionary, Buffer>> indices2d = + new Dictionary, Buffer>>(); + /// + /// Index cache for 3D kernels. + /// + private static Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>> indices3d = + new Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>>(); + + /// + /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data. + /// + internal AcceleratorHandler() + { + if (initialized) return; + + context = Context.Create() + .Optimize(OptimizationLevel.O2) + .Inlining(InliningMode.Aggressive) + .AllAccelerators() + //.Math(MathMode.Fast32BitOnly) + .ToContext(); + var selectedDevice = context.Devices[0]; + + foreach (var d in context.Devices) + { + Console.WriteLine("Detected {0} accelerator.", d.ToString()); + + if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL) + selectedDevice = d; + if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda) + selectedDevice = d; + } + + accelerator = selectedDevice.CreateAccelerator(context); + //accelerator = context.Devices[0].CreateAccelerator(context); + + Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); + initialized = true; + block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256; + } + + /// + /// Synchronize pending operations. + /// + private void Synchronize() => accelerator.Synchronize(); + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
"," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + + /// + /// Precomputes and caches the indices for a 1D for loop. + /// + /// The range of the for loop. + /// The calling location in the source code. + /// The calculated index. + internal Index Get1DIdx((int, int) range, string src) + { + if (indices1d.ContainsKey(src)) + { + var data = indices1d[src]; + if (data.Item1 == range.Item1 && data.Item2 == range.Item2) + return new Index(data.Item3); + else + data.Item3.Dispose(); + } + + int[] indices = new int[range.Item2 - range.Item1]; + + for (int i = 0; i < indices.Length; i++) + indices[i] = i + range.Item1; + + var buf = new Buffer(indices, Buffer.Behavior.To); + indices1d[src] = (range.Item1, range.Item2, buf); + return new Index(buf); + } + + /// + /// Precomputes and caches the indices for a 2D for loop. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. + /// The calling location in the source code. + /// A tuple of calculated indices. + internal ValueTuple Get2DIdx((int, int) range1, (int, int) range2, string src) + { + if (indices2d.ContainsKey(src)) + { + var data = indices2d[src]; + if (data.Item1 == range1.Item1 && data.Item2 == range1.Item2 && + data.Item3 == range2.Item1 && data.Item4 == range2.Item2) + return (new Index(data.Item5), new Index(data.Item6)); + else + { + data.Item5.Dispose(); + data.Item6.Dispose(); + } + } + + int[] indi = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1)]; + int[] indj = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1)]; + + int ci = range1.Item1, cj = range2.Item1; + + for (int i = 0; i < indi.Length; i++) + { + indi[i] = ci; + indj[i] = cj; + + if (++cj == range2.Item2) + { + cj = range2.Item1; + ++ci; + } + } + + var b1 = new Buffer(indi, Buffer.Behavior.To); + var b2 = new Buffer(indj, Buffer.Behavior.To); + indices2d[src] = (range1.Item1, range1.Item2, range2.Item1, range2.Item2, b1, b2); + + return (new Index(b1), new Index(b2)); + } + + /// + /// Precomputes and caches the indices for a 3D for loop. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. + /// The calling location in the source code. + /// A tuple of calculated indices. 
+ internal ValueTuple Get3DIdx((int, int) range1, (int, int) range2, (int, int) range3, string src) + { + if (indices3d.ContainsKey(src)) + { + var data = indices3d[src]; + if (data.Item1.Item1 == range1.Item1 && data.Item1.Item2 == range1.Item2 && + data.Item2.Item1 == range2.Item1 && data.Item2.Item2 == range2.Item2 && + data.Item3.Item1 == range3.Item1 && data.Item3.Item2 == range3.Item2) + return (new Index(data.Item4), new Index(data.Item5), new Index(data.Item6)); + else + { + data.Item4.Dispose(); + data.Item5.Dispose(); + data.Item6.Dispose(); + } + } + + int[] indi = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + int[] indj = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + int[] indk = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + + int ci = range1.Item1, cj = range2.Item1, ck = range3.Item1; + + for (int i = 0; i < indi.Length; i++) + { + indi[i] = ci; + indj[i] = cj; + indk[i] = ck; + + if (++ck == range3.Item2) + { + ck = range3.Item1; + + if (++cj == range2.Item2) + { + cj = range2.Item1; + ++ci; + } + } + } + + var b1 = new Buffer(indi, Buffer.Behavior.To); + var b2 = new Buffer(indj, Buffer.Behavior.To); + var b3 = new Buffer(indk, Buffer.Behavior.To); + indices3d[src] = ((range1.Item1, range1.Item2), (range2.Item1, range2.Item2), (range3.Item1, range3.Item2), b1, b2, b3); + + return (new Index(b1), new Index(b2), new Index(b3)); + } + + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Dispatches a linear kernel with the given number of parameters. + /// + /// The range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var idx = Get1DIdx(range1, src); + var len = range1.Item2 - range1.Item1; + + var kernel = GetKernel(action, src); + +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + + kernel((len / block_size, block_size), idx +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + idx.AddOffset(offset); + + kernel((1, not_done), idx +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + } + + Synchronize(); + } +<# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Dispatches a 2D kernel with the given number of parameters. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? 
"," : "") #> <# } #> + >((int, int) range1, (int, int) range2, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1); + (var i, var j) = Get2DIdx(range1, range2, src); + + var kernel = GetKernel(action, src); + +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + + kernel((len / block_size, block_size), i, j +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + i.AddOffset(offset); + j.AddOffset(offset); + + kernel((1, not_done), i, j +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + } + + Synchronize(); + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Dispatches a 3D kernel with the given number of parameters. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, (int, int) range3, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1); + (var i, var j, var k) = Get3DIdx(range1, range2, range3, src); + + var kernel = GetKernel(action, src); + +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + + kernel((len / block_size, block_size), i, j, k +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + i.AddOffset(offset); + j.AddOffset(offset); + k.AddOffset(offset); + + kernel((1, not_done), i, j, k +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + } + + Synchronize(); + } +<# } #> + } +} diff --git a/DotMP/GPU/AssemblyAttributes.cs b/DotMP/GPU/AssemblyAttributes.cs new file mode 100644 index 00000000..7077a588 --- /dev/null +++ b/DotMP/GPU/AssemblyAttributes.cs @@ -0,0 +1,19 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. 
+* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("ILGPURuntime")] \ No newline at end of file diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs new file mode 100644 index 00000000..26832163 --- /dev/null +++ b/DotMP/GPU/Buffer.cs @@ -0,0 +1,226 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using System; +using System.Runtime.CompilerServices; +using ILGPU; +using ILGPU.Runtime; + +namespace DotMP.GPU +{ + namespace Buffer + { + /// + /// Specifies the behavior of the buffer. + /// + public enum Behavior + { + /// + /// Specifies that data should be transfered to the GPU, but not from it. + /// + To, + /// + /// Specifies that data should be transfered from the GPU, but not to it. + /// + From, + /// + /// Specifies that data should be transfered both to and from the GPU. + /// + ToFrom, + /// + /// Specifies that the data shouldn't be transfered to or from the GPU. For internal use. + /// + NoCopy + } + } + + /// + /// Buffer to manage GPU memory. Should only be created on the CPU. + /// + public sealed class Buffer : IDisposable + where T : unmanaged + { + /// + /// The ILGPU buffer for 1D arrays. + /// + private MemoryBuffer1D buf1d; + + /// + /// The ILGPU buffer for 2D arrays. + /// + private MemoryBuffer2D buf2d; + + /// + /// The ILGPU buffer for 3D arrays. + /// + private MemoryBuffer3D buf3d; + + /// + /// Behavior of the data, as specified by Behavior. + /// + private Buffer.Behavior behavior; + + /// + /// The CPU 1D array, so that we can copy the data back. + /// + private T[] data1d; + + /// + /// The CPU 2D array, so that we can copy the data back. + /// + private T[,] data2d; + + /// + /// The CPU 3D array, so that we can copy the data back. + /// + private T[,,] data3d; + + /// + /// The number of dimensions in the array. + /// + internal int Dimensions { get; private set; } + + /// + /// Constructor for buffer object. Allocates a 1D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. 
+ public Buffer(T[] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data1d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf1d = AcceleratorHandler.accelerator.Allocate1D(data); + break; + case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: + buf1d = AcceleratorHandler.accelerator.Allocate1D(data.Length); + break; + } + + Dimensions = 1; + } + + /// + /// Constructor for buffer object. Allocates a 2D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. + public Buffer(T[,] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data2d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY(data); + break; + case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: + buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY((data.GetLength(0), data.GetLength(1))); + break; + } + + Dimensions = 2; + } + + /// + /// Constructor for buffer object. Allocates a 3D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. + public Buffer(T[,,] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data3d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf3d = AcceleratorHandler.accelerator.Allocate3DDenseXY(data); + break; + case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: + buf3d = AcceleratorHandler.accelerator.Allocate3DDenseXY((data.GetLength(0), data.GetLength(1), data.GetLength(2))); + break; + } + + Dimensions = 3; + } + + /// + /// Dispose of the buffer, freeing GPU memory and copying any relevant data back to the CPU. + /// + public void Dispose() + { + if (Dimensions == 1) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + buf1d.GetAsArray1D().CopyTo(data1d, 0); + } + + buf1d.Dispose(); + } + else if (Dimensions == 2) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + System.Buffer.BlockCopy(buf2d.GetAsArray2D(), 0, data2d, 0, Unsafe.SizeOf() * data2d.Length); + } + + buf2d.Dispose(); + } + else if (Dimensions == 3) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + System.Buffer.BlockCopy(buf3d.GetAsArray3D(), 0, data3d, 0, Unsafe.SizeOf() * data3d.Length); + } + + buf3d.Dispose(); + } + } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView1D View1D { get => buf1d.View; } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView2D View2D { get => buf2d.View; } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView3D View3D { get => buf3d.View; } + } +} \ No newline at end of file diff --git a/DotMP/GPU/Exceptions.cs b/DotMP/GPU/Exceptions.cs new file mode 100644 index 00000000..4705041b --- /dev/null +++ b/DotMP/GPU/Exceptions.cs @@ -0,0 +1,21 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using System; + +namespace DotMP.GPU +{ +} \ No newline at end of file diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt new file mode 100644 index 00000000..6cf2d841 --- /dev/null +++ b/DotMP/GPU/Gpu.tt @@ -0,0 +1,140 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +<#@ template debug="false" hostspecific="false" language="C#" #> +<#@ output extension=".cs" #> +<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' }; + int max = 13; #> + +using System; +using System.Runtime.CompilerServices; + +namespace DotMP.GPU +{ + /// + /// The main class of DotMP's GPU API, powered by the ILGPU project. + /// Contains all the main methods for constructing and running GPU kernels. + /// The GPU API is not thread-safe at the current moment, so its methods should not be called from within a Parallel.ParallelRegion! + /// + public static class Parallel + { + /// + /// Formats the caller information for determining uniqueness of a call. + /// + /// The calling file. + /// The calling line number. + /// A formatted string representing "{filename}:{linenum}" + private static string FormatCaller(string filename, int linenum) + { + return string.Format("{0}:{1}", filename, linenum); + } + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelFor< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? 
"," : "") #> <# } #> + >(int start, int end, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel((start, end), +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Creates a collapsed GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// + /// The range of the outer for loop. + /// The range of the inner for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelForCollapse< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(range1, range2, +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Creates a collapsed GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// + /// The range of the outer for loop. + /// The range of the middle for loop. + /// The range of the inner for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelForCollapse< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, (int, int) range3, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(range1, range2, range3, +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + } +} diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs new file mode 100644 index 00000000..036fe1a4 --- /dev/null +++ b/DotMP/GPU/GpuArray.cs @@ -0,0 +1,141 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using ILGPU; +using ILGPU.IR.Values; +using ILGPU.Runtime; +using System; +using System.Diagnostics.CodeAnalysis; + +namespace DotMP.GPU +{ + /// + /// Wrapper object for representing arrays on the GPU. + /// + /// + [ExcludeFromCodeCoverage] + public struct GPUArray + where T : unmanaged + { + /// + /// The ILGPU view for 1D arrays. + /// + private ArrayView1D view1d; + + /// + /// The ILGPU view for 2D arrays. + /// + private ArrayView2D view2d; + + /// + /// The ILGPU view for 3D arrays. + /// + private ArrayView3D view3d; + + /// + /// Number of dimensions. + /// + private int dims; + + /// + /// Constructor. + /// + /// The Buffer to create an array from. + internal GPUArray(Buffer buf) + { + switch (buf.Dimensions) + { + default: + case 1: + view1d = buf.View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; + break; + case 2: + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + view2d = buf.View2D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; + break; + case 3: + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + view3d = buf.View3D; + break; + } + + dims = buf.Dimensions; + } + + /// + /// Overload for [] operator. + /// + /// The ID to index into. + /// The data at that ID. + public ref T this[int idx] + { + get => ref view1d[idx]; + } + + /// + /// Overload for [,] operator. + /// + /// The first ID to index into. + /// The second ID to index into. + /// The data at that ID. + public ref T this[int i, int j] + { + get => ref view2d[i, j]; + } + + /// + /// Overload for [,,] operator. + /// + /// The first ID to index into. + /// The second ID to index into. + /// The third ID to index into. + /// The data at that ID. + public ref T this[int i, int j, int k] + { + get => ref view3d[i, j, k]; + } + + /// + /// Gets the length of the array. 
+ /// + public int Length + { + get + { + switch (dims) + { + case 1: + default: + return view1d.IntLength; + case 2: + return view2d.IntLength; + case 3: + return view3d.IntLength; + } + } + } + } +} diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs new file mode 100644 index 00000000..8de4dc3e --- /dev/null +++ b/DotMP/GPU/Index.cs @@ -0,0 +1,78 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using ILGPU; +using ILGPU.Runtime; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Xml; + +namespace DotMP.GPU +{ + /// + /// Represents an index passed as the first index argument. + /// + [ExcludeFromCodeCoverage] + public struct Index + { + /// + /// Lookup table for indices. + /// + private ArrayView1D lookup; + /// + /// Offset for followup kernels. + /// + private int offset; + /// + /// Cached index. + /// + private int idx; + + /// + /// Constructor. + /// + /// Buffer representing the indices. + internal Index(Buffer buf) + { + this.lookup = buf.View1D; + offset = 0; + idx = -1; + } + + /// + /// Adds an offset in preperation for a followup kernel. + /// + /// The offset to set. + internal void AddOffset(int offset) + { + this.offset = offset; + } + + /// + /// Calculates the index and caches for future use. + /// + /// The Index object to cast to int. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator int(Index i) + { + if (i.idx == -1) + i.idx = i.lookup[Grid.GlobalLinearIndex + i.offset]; + + return i.idx; + } + } +} diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs index a7f4f8c9..a64ae99f 100644 --- a/DotMP/Parallel.cs +++ b/DotMP/Parallel.cs @@ -16,31 +16,34 @@ using System; using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.ComponentModel; using System.Threading; using DotMP.Exceptions; using DotMP.Schedulers; +using System.Diagnostics.CodeAnalysis; namespace DotMP { /// /// The main class of DotMP. /// Contains all the main methods for parallelism. - /// For users, this is the main class you want to worry about, along with Lock, Shared, and Atomic + /// For users, this is the main class you want to worry about, along with Lock, Shared, Atomic, and GPU. /// public static class Parallel { /// /// The dictionary for critical regions. /// - private static volatile Dictionary critical_lock = new Dictionary(); + private static volatile Dictionary critical_lock = new Dictionary(); /// /// The dictionary for single regions. /// - private static volatile HashSet single_thread = new HashSet(); + private static volatile HashSet single_thread = new HashSet(); /// /// The dictionary for ordered regions. 
/// - private static volatile Dictionary ordered = new Dictionary(); + private static volatile Dictionary ordered = new Dictionary(); /// /// Barrier object for DotMP.Parallel.Barrier() /// @@ -174,6 +177,17 @@ private static void ValidateParams(int start = 0, int end = 0, IScheduler schedu throw new InvalidArgumentsException(string.Format("Chunk size must be specified with user-defined schedulers, as it cannot be inferred.")); } + /// + /// Formats the caller information for determining uniqueness of a call. + /// + /// The calling file. + /// The calling line number. + /// A formatted string representing "{filename}:{linenum}" + private static string FormatCaller(string filename, int linenum) + { + return string.Format("{0}:{1}", filename, linenum); + } + /// /// Creates a for loop inside a parallel region. /// A for loop created with For inside of a parallel region is executed in parallel, with iterations being distributed among the threads, and potentially out-of-order. @@ -1096,11 +1110,16 @@ public static void ParallelSections(uint? num_threads = null, params Action[] ac /// Creates a critical region. /// A critical region is a region of code that can only be executed by one thread at a time. /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished. + /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the critical region. Must be unique per region but consistent across all threads. /// The action to be performed in the critical region. /// The ID of the critical region. /// Thrown when not in a parallel region. + [Obsolete("This version of Critical is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static int Critical(int id, Action action) { if (!InParallel()) @@ -1110,6 +1129,45 @@ public static int Critical(int id, Action action) object lock_obj; + lock (critical_lock) + { + if (!critical_lock.ContainsKey(id.ToString())) + { + critical_lock.Add(id.ToString(), new object()); + } + + lock_obj = critical_lock[id.ToString()]; + } + + lock (lock_obj) + { + action(); + } + + return id; + } + + /// + /// Creates a critical region. + /// A critical region is a region of code that can only be executed by one thread at a time. + /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished. + /// + /// The action to be performed in the critical region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// The ID of the critical region. + /// Thrown when not in a parallel region. + public static void Critical(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + + if (!InParallel()) + { + throw new NotInParallelRegionException("Cannot use DotMP Critical outside of a parallel region."); + } + + object lock_obj; + lock (critical_lock) { if (!critical_lock.ContainsKey(id)) @@ -1124,8 +1182,6 @@ public static int Critical(int id, Action action) { action(); } - - return id; } /// @@ -1180,11 +1236,16 @@ public static void Master(Action action) /// Creates a single region. /// A single region is only executed once per Parallel.ParallelRegion. /// The first thread to encounter the single region marks the region as encountered, then executes it. 
+ /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the single region. Must be unique per region but consistent across all threads. /// The action to be performed in the single region. /// Thrown when not in a parallel region. /// Thrown when nested inside another worksharing region. + [Obsolete("This version of Single is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static void Single(int id, Action action) { var freg = new ForkedRegion(); @@ -1204,6 +1265,55 @@ public static void Single(int id, Action action) Interlocked.Increment(ref freg.in_workshare); + lock (single_thread) + { + if (!single_thread.Contains(id.ToString())) + { + single_thread.Add(id.ToString()); + new_single = true; + } + } + + if (new_single) + { + action(); + } + + Interlocked.Decrement(ref freg.in_workshare); + + Barrier(); + } + + /// + /// Creates a single region. + /// A single region is only executed once per Parallel.ParallelRegion. + /// The first thread to encounter the single region marks the region as encountered, then executes it. + /// + /// The action to be performed in the single region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// Thrown when not in a parallel region. + /// Thrown when nested inside another worksharing region. + public static void Single(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + var freg = new ForkedRegion(); + bool new_single = false; + + if (!freg.in_parallel) + { + throw new NotInParallelRegionException("Cannot use DotMP Single outside of a parallel region."); + } + + var ws = new WorkShare(); + + if (ws.in_for) + { + throw new CannotPerformNestedWorksharingException("Cannot use DotMP Single nested within other worksharing constructs."); + } + + Interlocked.Increment(ref freg.in_workshare); + lock (single_thread) { if (!single_thread.Contains(id)) @@ -1227,10 +1337,15 @@ public static void Single(int id, Action action) /// Creates an ordered region. /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction<T>() loop. /// This also acts as an implicit Critical() region. + /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the ordered region. Must be unique per region but consistent across all threads. /// The action to be performed in the ordered region. /// Thrown when not in a parallel region. + [Obsolete("This version of Ordered is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static void Ordered(int id, Action action) { var freg = new ForkedRegion(); @@ -1240,6 +1355,46 @@ public static void Ordered(int id, Action action) throw new NotInParallelRegionException("Cannot use DotMP Ordered outside of a parallel region."); } + lock (ordered) + { + if (!ordered.ContainsKey(id.ToString())) + { + ordered.Add(id.ToString(), 0); + } + Thread.MemoryBarrier(); + } + + WorkShare ws = new WorkShare(); + + while (ordered[id.ToString()] != ws.working_iter) ; + + action(); + + lock (ordered) + { + ordered[id.ToString()]++; + } + } + + /// + /// Creates an ordered region. + /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction<T>() loop. 
+ /// This also acts as an implicit Critical() region. + /// + /// The action to be performed in the ordered region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// Thrown when not in a parallel region. + public static void Ordered(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + var freg = new ForkedRegion(); + + if (!freg.in_parallel) + { + throw new NotInParallelRegionException("Cannot use DotMP Ordered outside of a parallel region."); + } + lock (ordered) { if (!ordered.ContainsKey(id)) @@ -1270,7 +1425,7 @@ public static int GetNumThreads() { var freg = new ForkedRegion(); - return (freg.reg is not null) + return freg.in_parallel ? (int)freg.reg.num_threads : 1; } diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj new file mode 100644 index 00000000..9cf0a6f0 --- /dev/null +++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj @@ -0,0 +1,18 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + + + + + diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs new file mode 100644 index 00000000..75d0747f --- /dev/null +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -0,0 +1,307 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Running; +using BenchmarkDotNet.Diagnosers; + +/* jscpd:ignore-start */ + +[SimpleJob(RuntimeMoniker.Net60)] +[ThreadingDiagnoser] +[HardwareCounters] +[EventPipeProfiler(EventPipeProfile.CpuSampling)] +// test heat transfer using Parallel.For +public class HeatTransfer +{ + // scratch array + private double[,] scratch = new double[0, 0]; + // grid array + private double[,] grid = new double[0, 0]; + + // parallel type enum + public enum ParType { DMPFor, DMPGPU } + + // test dims of 100x100, 1000x1000, and 5000x5000 + [Params(768)] + public int dim; + + // test with 10 steps and 100 steps + [Params(100)] + public int steps; + + // test with all 3 parallel types + [Params(ParType.DMPFor, ParType.DMPGPU)] + public ParType type; + + // change this to configure the number of threads to use + public uint num_threads = 6; + + // buffer for grid + private DotMP.GPU.Buffer gridbuf; + + // buffer for scratch + private DotMP.GPU.Buffer scratchbuf; + + // run the setup + [GlobalSetup] + public void Setup() + { + scratch = new double[dim, dim]; + grid = new double[dim, dim]; + + for (int i = 0; i < dim; i++) + { + grid[0, i] = 100.0; + grid[i, 0] = 100.0; + grid[dim - 1, i] = 100.0; + grid[i, dim - 1] = 100.0; + } + + if (type == ParType.DMPGPU) + { + gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.ToFrom); + scratchbuf = new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); + } + } + + //run the simulation + [Benchmark] + public void DoSimulation() + { + Action action = () => + { + //do the steps + for (int i = 0; i < steps; i++) + { + DoStep(); + } + }; + + if (type == ParType.DMPGPU) + { + action(); + //gridbuf.Dispose(); + //scratchbuf.Dispose(); + } + else + { + // spawn a parallel region + DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action); + } + } + + //do a step of the heat transfer simulation + public void DoStep() + { + switch (type) + { + case ParType.DMPFor: + //iterate over all cells not on the border + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + }); + + //copy the scratch array to the grid array + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + }); + break; + + case ParType.DMPGPU: + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + }); + + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + grid[i, j] = scratch[i, j]; + }); + break; + } + } +} + +// test heat transfer using Parallel.For +public class HeatTransferVerify +{ + // scratch array + private double[,] scratch = new double[0, 0]; + // grid array + private double[,] grid = new double[0, 0]; + + // parallel type enum + public enum ParType { DMPFor, DMPGPU } + + // test dims of 100x100, 1000x1000, and 5000x5000 + public int dim = 1000; + + // test with 10 steps and 100 steps + public int steps = 100; + + // test with all 3 parallel types + public ParType 
type = ParType.DMPFor; + + // change this to configure the number of threads to use + public uint num_threads = 6; + + // buffer for grid + private DotMP.GPU.Buffer gridbuf; + + // buffer for scratch + private DotMP.GPU.Buffer scratchbuf; + + // run the setup + public void Setup() + { + scratch = new double[dim, dim]; + grid = new double[dim, dim]; + + for (int i = 0; i < dim; i++) + { + grid[0, i] = 100.0; + grid[i, 0] = 100.0; + grid[dim - 1, i] = 100.0; + grid[i, dim - 1] = 100.0; + } + + if (type == ParType.DMPGPU) + { + gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.ToFrom); + scratchbuf = new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); + } + } + + //run the simulation + public void DoSimulation() + { + Action action = () => + { + //do the steps + for (int i = 0; i < steps; i++) + { + DoStep(); + } + }; + + if (type == ParType.DMPGPU) + { + action(); + gridbuf.Dispose(); + scratchbuf.Dispose(); + } + else + { + // spawn a parallel region + DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action); + } + } + + //do a step of the heat transfer simulation + public void DoStep() + { + switch (type) + { + case ParType.DMPFor: + //iterate over all cells not on the border + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + }); + + //copy the scratch array to the grid array + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + }); + break; + + case ParType.DMPGPU: + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + }); + + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + grid[i, j] = scratch[i, j]; + }); + break; + } + } + + public void Verify() + { + type = ParType.DMPFor; + Setup(); + DoSimulation(); + double[,] gridA = grid; + + type = ParType.DMPGPU; + Setup(); + DoSimulation(); + double[,] gridB = grid; + + bool wrong = false; + + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + if (gridA[i, j] != gridB[i, j]) + { + wrong = true; + Console.WriteLine("Wrong at ({0}, {1}), expected {2}, got {3}.", i, j, gridA[i, j], gridB[i, j]); + } + + if (wrong) + Console.WriteLine("WRONG RESULT"); + else + Console.WriteLine("RIGHT RESULT"); + } +} + +/* jscpd:ignore-end */ + +// driver +public class Program +{ + public static void Main(string[] args) + { + if (args.Length > 0 && args[0] == "verify") + new HeatTransferVerify().Verify(); + else + BenchmarkRunner.Run(); + } +} diff --git a/benchmarks/GPUOverhead/GPUOverhead.csproj b/benchmarks/GPUOverhead/GPUOverhead.csproj new file mode 100644 index 00000000..9cf0a6f0 --- /dev/null +++ b/benchmarks/GPUOverhead/GPUOverhead.csproj @@ -0,0 +1,18 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + + + + + diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs new file mode 100644 index 00000000..9c0dde2c --- /dev/null +++ b/benchmarks/GPUOverhead/Program.cs @@ -0,0 +1,56 @@ +/* +* DotMP - A collection of powerful abstractions for 
diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs
new file mode 100644
index 00000000..9c0dde2c
--- /dev/null
+++ b/benchmarks/GPUOverhead/Program.cs
@@ -0,0 +1,56 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API.
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Diagnosers;
+
+/* jscpd:ignore-start */
+
+[SimpleJob(RuntimeMoniker.Net60)]
+[ThreadingDiagnoser]
+[HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
+public class Overhead
+{
+    DotMP.GPU.Buffer<int> buf;
+
+    // run the setup
+    [GlobalSetup]
+    public void Setup()
+    {
+        buf = new DotMP.GPU.Buffer<int>(new int[1, 1], DotMP.GPU.Buffer.Behavior.NoCopy);
+    }
+
+    // measure the cost of launching an empty DotMP.GPU kernel
+    [Benchmark]
+    public void TestOverhead()
+    {
+        DotMP.GPU.Parallel.ParallelForCollapse((0, 500), (0, 500), buf, (i, j, buf) => { });
+    }
+}
+
+/* jscpd:ignore-end */
+
+// driver
+public class Program
+{
+    public static void Main(string[] args)
+    {
+        BenchmarkRunner.Run<Overhead>();
+    }
+}
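+
+// For illustration, a hedged sketch of the same ParallelForCollapse call with a non-empty body.
+// The OverheadSketch class and its Run method are hypothetical and not part of the benchmark;
+// the sketch assumes, as the heat-transfer verify driver does, that a Behavior.ToFrom buffer
+// copies its contents back to the host array when it is disposed.
+public static class OverheadSketch
+{
+    public static void Run()
+    {
+        double[,] data = new double[500, 500];
+        var buf = new DotMP.GPU.Buffer<double>(data, DotMP.GPU.Buffer.Behavior.ToFrom);
+
+        // fill every element of the array on the device
+        DotMP.GPU.Parallel.ParallelForCollapse((0, 500), (0, 500), buf, (i, j, arr) =>
+        {
+            arr[i, j] = 1.0;
+        });
+
+        // release the buffer so the device results land back in `data`
+        buf.Dispose();
+    }
+}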
diff --git a/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj
new file mode 100644
index 00000000..9cf0a6f0
--- /dev/null
+++ b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+
+
+
+
+
+
+
+
+</Project>
diff --git a/benchmarks/ILGPUOverhead/Program.cs b/benchmarks/ILGPUOverhead/Program.cs
new file mode 100644
index 00000000..6153183c
--- /dev/null
+++ b/benchmarks/ILGPUOverhead/Program.cs
@@ -0,0 +1,63 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API.
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Diagnosers;
+using System;
+using ILGPU;
+using ILGPU.Runtime;
+
+/* jscpd:ignore-start */
+
+[SimpleJob(RuntimeMoniker.Net60)]
+[ThreadingDiagnoser]
+[HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
+public class Overhead
+{
+    Action<KernelConfig, ArrayView1D<int, Stride1D.Dense>> kernel;
+    ArrayView1D<int, Stride1D.Dense> data;
+
+    // run the setup
+    [GlobalSetup]
+    public void Setup()
+    {
+        var context = Context.CreateDefault();
+        var accelerator = context.Devices[1].CreateAccelerator(context); // assumes a second enumerated device is present
+        kernel = accelerator.LoadStreamKernel<ArrayView1D<int, Stride1D.Dense>>(arr => { });
+        data = accelerator.Allocate1D<int>(1);
+    }
+
+    // measure the cost of launching an empty ILGPU kernel
+    [Benchmark]
+    public void TestOverhead()
+    {
+        kernel((1, 256), data);
+    }
+}
+
+/* jscpd:ignore-end */
+
+// driver
+public class Program
+{
+    public static void Main(string[] args)
+    {
+        BenchmarkRunner.Run<Overhead>();
+    }
+}
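+
+// For context, a minimal self-contained sketch of the raw ILGPU launch pattern the benchmark
+// above times. The ILGPULaunchSketch class is a hypothetical, illustrative helper and is not
+// part of the benchmark; note that ILGPU kernel launches are asynchronous, so Synchronize()
+// is what actually waits for the device work to finish.
+public static class ILGPULaunchSketch
+{
+    public static void Run()
+    {
+        using var context = Context.CreateDefault();
+        // prefer a GPU device and fall back to the CPU accelerator if none is present
+        using var accelerator = context.GetPreferredDevice(preferCPU: false).CreateAccelerator(context);
+
+        var kernel = accelerator.LoadStreamKernel<ArrayView1D<int, Stride1D.Dense>>(arr => { });
+        using var data = accelerator.Allocate1D<int>(1);
+
+        kernel((1, 256), data.View);   // enqueue the empty kernel
+        accelerator.Synchronize();     // block until it has finished
+    }
+}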