From 61b93af05eb3b136d05ed197e6808b26106361ee Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 04:18:46 -0600 Subject: [PATCH 01/61] test using new GPU data transfer API --- DotMP-Tests/GPUTests.cs | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs index e97422b5..c155fd2d 100644 --- a/DotMP-Tests/GPUTests.cs +++ b/DotMP-Tests/GPUTests.cs @@ -33,13 +33,18 @@ public void GPU_for_works() random_init(x); random_init(y); - DotMP.GPU.Parallel.DataTo(a, x, y); - DotMP.GPU.Parallel.DataFrom(res); - DotMP.GPU.Parallel.ParallelFor - (0, a.Length, (i, a, x, y, res) => { - res[i] = (float)(a[i] * x[i] + y[i]); - }); + using var a_gpu = new DotMP.GPU.Buffer(a, DotMP.GPU.Buffer.Behavior.To); + using var x_gpu = new DotMP.GPU.Buffer(x, DotMP.GPU.Buffer.Behavior.To); + using var y_gpu = new DotMP.GPU.Buffer(y, DotMP.GPU.Buffer.Behavior.To); + using var res_gpu = new DotMP.GPU.Buffer(res, DotMP.GPU.Buffer.Behavior.From); + + DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, x_gpu, y_gpu, res_gpu, + (i, a, x, y, res) => + { + res[i] = (float)(a[i] * x[i] + y[i]); + }); + } for (int i = 0; i < a.Length; i++) { @@ -50,11 +55,13 @@ public void GPU_for_works() double[] a_old = a.Select(a => a).ToArray(); - DotMP.GPU.Parallel.DataToFrom(a); - DotMP.GPU.Parallel.ParallelFor(0, a.Length, (i, a) => + using (var a_gpu = new DotMP.GPU.Buffer(a, DotMP.GPU.Buffer.Behavior.ToFrom)) { - a[i]++; - }); + DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, (i, a) => + { + a[i]++; + }); + } for (int i = 0; i < a.Length; i++) { From 60a1c9fd35a80c58ef4e2fb9c696644cf1cc09a3 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 04:18:57 -0600 Subject: [PATCH 02/61] implement new memory model --- DotMP/GPU/AcceleratorHandler.cs | 201 +++++--------------------------- DotMP/GPU/Buffer.cs | 95 +++++++++++++++ DotMP/GPU/Gpu.cs | 65 +++-------- DotMP/GPU/GpuArray.cs | 4 +- 4 files changed, 144 
insertions(+), 221 deletions(-) create mode 100644 DotMP/GPU/Buffer.cs diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs index e6f2771f..f2889a00 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -21,31 +21,10 @@ internal class AcceleratorHandler /// /// The accelerator object. /// - private static Accelerator accelerator; + internal static Accelerator accelerator; /// - /// The GPU pointers for arrays going to the GPU. + /// /// - private static dynamic[] tos; - /// - /// The GPU pointers for arrays coming back from the GPU. - /// - private static dynamic[] froms; - /// - /// The CPU pointers for arrays coming back from the GPU. - /// - private static dynamic[] froms_cpu; - /// - /// The GPU pointers for arrays going both to and from the GPU. - /// - private static dynamic[] tofroms; - /// - /// The CPU pointers for arrays going both to and from the GPU. - /// - private static dynamic[] tofroms_cpu; - /// - /// Counts how many arrays have been copied back to the CPU for bookkeeping. - /// - private static int copied_back; private static int block_size; /// @@ -59,129 +38,13 @@ internal AcceleratorHandler() accelerator = context.Devices[0].CreateAccelerator(context); Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); initialized = true; - copied_back = 0; block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256; - - tos = new dynamic[0]; - froms = new dynamic[0]; - tofroms = new dynamic[0]; - froms_cpu = new dynamic[0][]; - tofroms_cpu = new dynamic[0][]; - } - - /// - /// Aggregates the parameters into a single array. - /// - /// A dynamic array of all parameters. 
- private dynamic[] AggregateParams(int count) - { - dynamic[] ret = tos.Concat(froms).Concat(tofroms).ToArray(); - - if (ret.Length != count) - throw new WrongNumberOfDataMovementsSpecifiedException(string.Format("Specified {0} data movement(s), expected {1}.", ret.Length, count)); - - return ret; - } - - /// - /// Called if data should be moved to the device. - /// Allocates data on GPU and copies the data from the CPU. - /// - /// The type of data to allocate. - /// The data to allocate. - internal void AllocateTo(T[][] values) - where T : unmanaged - { - if (froms.Length > 0 || tofroms.Length > 0) - throw new ImproperDataMovementOrderingException("DataTo should be called before DataFrom and DataToFrom."); - - var tos = values.Select(v => accelerator.Allocate1D(v)).ToArray(); - AcceleratorHandler.tos = AcceleratorHandler.tos.Concat(tos).ToArray(); - - for (int i = 0; i < tos.Length; i++) - tos[i].CopyFromCPU(values[i]); - } - - /// - /// Called if data should be moved from the device. - /// Allocates data on GPU. - /// - /// The type of data to allocate. - /// The data to allocate. - internal void AllocateFrom(T[][] values) - where T : unmanaged - { - if (tofroms.Length > 0) - throw new ImproperDataMovementOrderingException("DataFrom should be called before DataToFrom."); - - var froms = values.Select(v => accelerator.Allocate1D(v.Length)).ToArray(); - AcceleratorHandler.froms = AcceleratorHandler.froms.Concat(froms).ToArray(); - AcceleratorHandler.froms_cpu = AcceleratorHandler.froms_cpu.Concat(values).ToArray(); - } - - /// - /// Called if data should be moved to and from the device. - /// Allocates data on GPU and copies the data from the CPU. - /// - /// The type of data to allocate. - /// The data to allocate. 
- internal void AllocateToFrom(T[][] values) - where T : unmanaged - { - var tofroms = values.Select(v => accelerator.Allocate1D(v)).ToArray(); - AcceleratorHandler.tofroms = AcceleratorHandler.tofroms.Concat(tofroms).ToArray(); - AcceleratorHandler.tofroms_cpu = AcceleratorHandler.tofroms_cpu.Concat(values).ToArray(); - - for (int i = 0; i < tos.Length; i++) - tofroms[i].CopyFromCPU(values[i]); - } - - /// - /// Synchronizes the GPU stream. - /// - internal void Synchronize() => - accelerator.DefaultStream.Synchronize(); - - /// - /// Copies a piece of GPU memory back to the CPU. - /// - /// The type of the data to transfer. - /// A MemoryBuffer1D object to transfer. - internal void CopyBack(dynamic item) - where T : unmanaged - { - MemoryBuffer1D castedItem = item; - - if (copied_back >= tos.Length && copied_back - tos.Length < froms.Length) - castedItem.GetAsArray1D().CopyTo(froms_cpu[copied_back - tos.Length], 0); - else if (copied_back >= tos.Length) - castedItem.GetAsArray1D().CopyTo(tofroms_cpu[copied_back - tos.Length - froms.Length], 0); - - copied_back++; } /// - /// Called to finalize kernel execution. - /// Clears all of the arrays used in the kernel. + /// Synchronize pending operations. /// - internal void FinalizeKernel() - { - foreach (var i in tos) - i.Dispose(); - tos = new dynamic[0]; - - foreach (var i in froms) - i.Dispose(); - froms = new dynamic[0]; - froms_cpu = new dynamic[0][]; - - foreach (var i in tofroms) - i.Dispose(); - tofroms = new dynamic[0]; - tofroms_cpu = new dynamic[0][]; - - copied_back = 0; - } + private void Synchronize() => accelerator.Synchronize(); /// /// Dispatches a kernel with one data parameter. @@ -189,21 +52,19 @@ internal void FinalizeKernel() /// The type of the data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. + /// The buffer to run the kernel with. /// The action to perform. 
- internal void DispatchKernel(int start, int end, Action> action) + internal void DispatchKernel(int start, int end, Buffer buf, Action> action) where T : unmanaged { - dynamic[] parameters = AggregateParams(1); var idx = new Index(); var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(parameters[0].View)); + new GPUArray(buf.View)); Synchronize(); - CopyBack(parameters[0]); - FinalizeKernel(); } /// @@ -213,24 +74,22 @@ internal void DispatchKernel(int start, int end, Action> a /// The type of the second data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. /// The action to perform. - internal void DispatchKernel(int start, int end, Action, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action) where T : unmanaged where U : unmanaged { - dynamic[] parameters = AggregateParams(2); var idx = new Index(); var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(parameters[0].View), - new GPUArray(parameters[1].View)); + new GPUArray(buf1.View), + new GPUArray(buf2.View)); Synchronize(); - CopyBack(parameters[0]); - CopyBack(parameters[1]); - FinalizeKernel(); } /// @@ -241,27 +100,25 @@ internal void DispatchKernel(int start, int end, Action /// The type of the third data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. /// The action to perform. 
- internal void DispatchKernel(int start, int end, Action, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged where V : unmanaged { - dynamic[] parameters = AggregateParams(3); var idx = new Index(); var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(parameters[0].View), - new GPUArray(parameters[1].View), - new GPUArray(parameters[2].View)); + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View)); Synchronize(); - CopyBack(parameters[0]); - CopyBack(parameters[1]); - CopyBack(parameters[2]); - FinalizeKernel(); } /// @@ -273,30 +130,28 @@ internal void DispatchKernel(int start, int end, ActionThe type of the fourth data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. /// The action to perform. 
- internal void DispatchKernel(int start, int end, Action, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged where V : unmanaged where W : unmanaged { - dynamic[] parameters = AggregateParams(4); var idx = new Index(); var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(parameters[0].View), - new GPUArray(parameters[1].View), - new GPUArray(parameters[2].View), - new GPUArray(parameters[3].View)); + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View)); Synchronize(); - CopyBack(parameters[0]); - CopyBack(parameters[1]); - CopyBack(parameters[2]); - CopyBack(parameters[3]); - FinalizeKernel(); } } } \ No newline at end of file diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs new file mode 100644 index 00000000..6419f130 --- /dev/null +++ b/DotMP/GPU/Buffer.cs @@ -0,0 +1,95 @@ +using System; +using DotMP.GPU; +using ILGPU.Runtime; + +namespace DotMP.GPU +{ + namespace Buffer + { + /// + /// Specifies the behavior of the buffer. + /// + public enum Behavior + { + /// + /// Specifies that data should be transfered to the GPU, but not from it. + /// + To, + /// + /// Specifies that data should be transfered from the GPU, but not to it. + /// + From, + /// + /// Specifies that data should be transfered both to and from the GPU. + /// + ToFrom + } + } + + /// + /// Buffer to manage GPU memory. Should only be created on the CPU. + /// + public class Buffer : IDisposable + where T : unmanaged + { + + /// + /// The ILGPU buffer. + /// + private MemoryBuffer1D buf; + + /// + /// Behavior of the data, as specified by Behavior. + /// + private Buffer.Behavior behavior; + + /// + /// The CPU array, so that we can copy the data back. 
+ /// + private T[] data; + + /// + /// Constructor for buffer object. Allocates data on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. + public Buffer(T[] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data = data; + + switch (behavior) + { + case Buffer.Behavior.To: + buf = AcceleratorHandler.accelerator.Allocate1D(data); + break; + case Buffer.Behavior.From: + buf = AcceleratorHandler.accelerator.Allocate1D(data.Length); + break; + case Buffer.Behavior.ToFrom: + buf = AcceleratorHandler.accelerator.Allocate1D(data); + break; + } + } + + /// + /// Dispose of the buffer, freeing GPU memory and copying any relevant data back to the CPU. + /// + public void Dispose() + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + buf.GetAsArray1D().CopyTo(data, 0); + } + + buf.Dispose(); + } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView1D View { get => buf.View; } + } +} \ No newline at end of file diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs index c541462a..63c402d5 100644 --- a/DotMP/GPU/Gpu.cs +++ b/DotMP/GPU/Gpu.cs @@ -17,13 +17,14 @@ public static class Parallel /// /// The start of the loop, inclusive. /// The end of the loop, exclusive. + /// The buffer to run the kernel with. /// The kernel to run on the GPU. /// The base type of the first argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Action> action) + public static void ParallelFor(int start, int end, Buffer buf, Action> action) where T : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, action); + handler.DispatchKernel(start, end, buf, action); } /// @@ -33,15 +34,17 @@ public static void ParallelFor(int start, int end, Action> /// /// The start of the loop, inclusive. /// The end of the loop, exclusive. 
+ /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. /// The kernel to run on the GPU. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Action, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action) where T : unmanaged where U : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, action); + handler.DispatchKernel(start, end, buf1, buf2, action); } /// @@ -51,17 +54,20 @@ public static void ParallelFor(int start, int end, Action /// The start of the loop, inclusive. /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. /// The kernel to run on the GPU. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Action, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged where V : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, action); + handler.DispatchKernel(start, end, buf1, buf2, buf3, action); } /// @@ -71,58 +77,23 @@ public static void ParallelFor(int start, int end, Action /// The start of the loop, inclusive. /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. /// The kernel to run on the GPU. 
/// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. /// The base type of the fourth argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Action, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged where V : unmanaged where W : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, action); - } - - /// - /// Specifies data movement to the GPU at the start of the kernel, but not back to the CPU at the end of the kernel. - /// Can be called multiple times with different datatypes, but is cleared after a call to Kernel(). - /// - /// The base type of the data. Must be an unmanaged type. - /// The data to move to the GPU. - public static void DataTo(params T[][] to_data) - where T : unmanaged - { - var handler = new AcceleratorHandler(); - handler.AllocateTo(to_data); - } - - /// - /// Specifies data movement back to the CPU at the end of the kernel, but not to the GPU at the start of the kernel.. - /// Can be called multiple times with different datatypes, but is cleared after a call to Kernel(). - /// - /// The base type of the data. Must be an unmanaged type. - /// The data to move from the GPU. - public static void DataFrom(params T[][] to_data) - where T : unmanaged - { - var handler = new AcceleratorHandler(); - handler.AllocateFrom(to_data); - } - - /// - /// Specifies data movement to the GPU at the start of the kernel, and from the GPU back to the CPU at the end of the kernel. - /// Can be called multiple times with different datatypes, but is cleared after a call to Kernel(). - /// - /// The base type of the data. Must be an unmanaged type. 
- /// The data to move to and from the GPU. - public static void DataToFrom(params T[][] to_data) - where T : unmanaged - { - var handler = new AcceleratorHandler(); - handler.AllocateToFrom(to_data); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action); } } } \ No newline at end of file diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index 0c3fd3e4..d7281740 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -1,4 +1,5 @@ using ILGPU; +using System; namespace DotMP.GPU { @@ -6,7 +7,8 @@ namespace DotMP.GPU /// Wrapper object for representing arrays on the GPU. /// /// - public struct GPUArray where T : unmanaged + public struct GPUArray + where T : unmanaged { /// /// Internal ArrayView object. From 49706b3910d14542fd8a3cebe279b8c7ddb09eb0 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 04:20:26 -0600 Subject: [PATCH 03/61] tidying up duplicate code --- DotMP/GPU/Buffer.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs index 6419f130..e0bd6908 100644 --- a/DotMP/GPU/Buffer.cs +++ b/DotMP/GPU/Buffer.cs @@ -63,14 +63,12 @@ public Buffer(T[] data, Buffer.Behavior behavior) switch (behavior) { case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: buf = AcceleratorHandler.accelerator.Allocate1D(data); break; case Buffer.Behavior.From: buf = AcceleratorHandler.accelerator.Allocate1D(data.Length); break; - case Buffer.Behavior.ToFrom: - buf = AcceleratorHandler.accelerator.Allocate1D(data); - break; } } From 68092ff3b67ff586130a1e50842402f58cbae29d Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 04:23:25 -0600 Subject: [PATCH 04/61] remove unnecessary implicit operators --- DotMP/GPU/GpuArray.cs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index d7281740..55c45edb 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -24,24 +24,6 @@ public GPUArray(ArrayView 
arrayView) this.arrayView = arrayView; } - /// - /// Implicit conversion to ArrayView. - /// - /// The GPUArray object. - public static implicit operator ArrayView(GPUArray array) - { - return array.arrayView; - } - - /// - /// Implicit conversion to GPUArray. - /// - /// The ArrayView object. - public static implicit operator GPUArray(ArrayView array) - { - return new GPUArray(array); - } - /// /// Overload for [] operator. /// From 0ac98e5faf92800be778097415e2bbdb7b3ca2e7 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 04:30:13 -0600 Subject: [PATCH 05/61] add overloads for dispatching loops with 5 or 6 variables --- DotMP/GPU/AcceleratorHandler.cs | 80 ++++++++++++++++++++++++++++++++- DotMP/GPU/Gpu.cs | 61 +++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 1 deletion(-) diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs index f2889a00..220d57bd 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -122,7 +122,7 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer } /// - /// The type of the first data parameter. + /// Dispatches a kernel with four parameters. /// /// The type of the first data parameter. /// The type of the second data parameter. @@ -153,5 +153,83 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buf Synchronize(); } + + /// + /// Dispatches a kernel with five parameters. + /// + /// The type of the first data parameter. + /// The type of the second data parameter. + /// The type of the third data parameter. + /// The type of the fourth data parameter. + /// The type of the fifth data parameter. + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. 
+ /// The action to perform. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with six parameters. + /// + /// The type of the first data parameter. + /// The type of the second data parameter. + /// The type of the third data parameter. + /// The type of the fourth data parameter. + /// The type of the fifth data parameter. + /// The type of the sixth data parameter. + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The action to perform. 
+ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View)); + + Synchronize(); + } } } \ No newline at end of file diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs index 63c402d5..6175bf09 100644 --- a/DotMP/GPU/Gpu.cs +++ b/DotMP/GPU/Gpu.cs @@ -95,5 +95,66 @@ public static void ParallelFor(int start, int end, Buffer buf1, B var handler = new AcceleratorHandler(); handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action); } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that five arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. 
+ public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that six arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. 
+ public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action); + } } } \ No newline at end of file From b76af519cd4b79f09fa5f19bf0e5915ce7caff94 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 05:11:06 -0600 Subject: [PATCH 06/61] add parfor_dump to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 25fb8c99..a24fbd40 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ docs/* .vscode *.opencover.xml *.sln +parfor_dump.cs ProcessedREADME.md # User-specific files From c2f591a0769f6fc5f6f294259d6566a0d269323e Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 05:11:18 -0600 Subject: [PATCH 07/61] add parallel for overload code gen --- DotMP/GPU/parfor_gen.py | 72 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 DotMP/GPU/parfor_gen.py diff --git a/DotMP/GPU/parfor_gen.py b/DotMP/GPU/parfor_gen.py new file mode 100644 index 00000000..f001dc9b --- /dev/null +++ b/DotMP/GPU/parfor_gen.py @@ -0,0 +1,72 @@ +ofile = open("./parfor_dump.cs", "w") + +cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight", + "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"] +ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", + "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"] + +letters = ["T", "U", "V", "W", "X", "Y", "Z", + "A", "B", "C", "D", "E", "F", "G", "H", "I"] + +for i in range(0, 16): + funcstr = "" + + funcstr += """/// +/// Creates a GPU 
parallel for loop. +/// The body of the kernel is run on a GPU target. +/// This overload specifies that {c} arrays are used on the GPU. +/// +/// The start of the loop, inclusive. +/// The end of the loop, exclusive.""".format(c=cardinals[i]) + + for j in range(i + 1): + adjusted = j + 1 + + funcstr += """ +/// The {o} buffer to run the kernel with.""".format(a=j + 1, o=ordinals[j]) + + funcstr += """ +/// The kernel to run on the GPU.""" + + for j in range(i + 1): + funcstr += """ +/// The base type of the {o} argument. Must be an unmanaged type.""".format(l=letters[j], o=ordinals[j]) + + funcstr += """ +public static void ParallelFor<""" + + for j in range(i): + funcstr += "{l}, ".format(l=letters[j]) + + funcstr += "{l}>(int start, int end, ".format(l=letters[i]) + + for j in range(i + 1): + adjusted = j + 1 + funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=adjusted) + + funcstr += "Action Date: Fri, 10 Nov 2023 05:11:37 -0600 Subject: [PATCH 08/61] add overloads for up to 16 kernel variables --- DotMP/GPU/Gpu.cs | 485 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 485 insertions(+) diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs index 6175bf09..624c6033 100644 --- a/DotMP/GPU/Gpu.cs +++ b/DotMP/GPU/Gpu.cs @@ -156,5 +156,490 @@ public static void ParallelFor(int start, int end, Buffer b var handler = new AcceleratorHandler(); handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action); } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that seven arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. 
+ /// The seventh buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that eight arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. 
+ /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that nine arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. 
+ /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that ten arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. 
Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that eleven arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. 
Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that twelve arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. 
+ /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The twelfth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + /// The base type of the twelfth argument. Must be an unmanaged type. + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + where E : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. 
+ /// This overload specifies that thirteen arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The twelfth buffer to run the kernel with. + /// The thirteenth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + /// The base type of the twelfth argument. Must be an unmanaged type. + /// The base type of the thirteenth argument. Must be an unmanaged type. 
+ public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + where E : unmanaged + where F : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that fourteen arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The twelfth buffer to run the kernel with. + /// The thirteenth buffer to run the kernel with. + /// The fourteenth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. 
Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + /// The base type of the twelfth argument. Must be an unmanaged type. + /// The base type of the thirteenth argument. Must be an unmanaged type. + /// The base type of the fourteenth argument. Must be an unmanaged type. + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Buffer buf14, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + where E : unmanaged + where F : unmanaged + where G : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that fifteen arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. 
+ /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The twelfth buffer to run the kernel with. + /// The thirteenth buffer to run the kernel with. + /// The fourteenth buffer to run the kernel with. + /// The fifteenth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + /// The base type of the twelfth argument. Must be an unmanaged type. + /// The base type of the thirteenth argument. Must be an unmanaged type. + /// The base type of the fourteenth argument. Must be an unmanaged type. + /// The base type of the fifteenth argument. Must be an unmanaged type. 
+ public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Buffer buf14, Buffer buf15, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + where E : unmanaged + where F : unmanaged + where G : unmanaged + where H : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, action); + } + + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that sixteen arrays are used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The twelfth buffer to run the kernel with. + /// The thirteenth buffer to run the kernel with. + /// The fourteenth buffer to run the kernel with. + /// The fifteenth buffer to run the kernel with. + /// The sixteenth buffer to run the kernel with. 
+ /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + /// The base type of the twelfth argument. Must be an unmanaged type. + /// The base type of the thirteenth argument. Must be an unmanaged type. + /// The base type of the fourteenth argument. Must be an unmanaged type. + /// The base type of the fifteenth argument. Must be an unmanaged type. + /// The base type of the sixteenth argument. Must be an unmanaged type. 
+ public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Buffer buf14, Buffer buf15, Buffer buf16, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + where E : unmanaged + where F : unmanaged + where G : unmanaged + where H : unmanaged + where I : unmanaged + { + var handler = new AcceleratorHandler(); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, action); + } } } \ No newline at end of file From 82165daa00fff2e4af60e21f243904531cbf58ce Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 05:12:47 -0600 Subject: [PATCH 09/61] move python to python folder --- .../{parfor_gen.py => Python/dispatch_gen.py} | 0 DotMP/GPU/Python/parfor_gen.py | 72 +++++++++++++++++++ 2 files changed, 72 insertions(+) rename DotMP/GPU/{parfor_gen.py => Python/dispatch_gen.py} (100%) create mode 100644 DotMP/GPU/Python/parfor_gen.py diff --git a/DotMP/GPU/parfor_gen.py b/DotMP/GPU/Python/dispatch_gen.py similarity index 100% rename from DotMP/GPU/parfor_gen.py rename to DotMP/GPU/Python/dispatch_gen.py diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py new file mode 100644 index 00000000..f001dc9b --- /dev/null +++ b/DotMP/GPU/Python/parfor_gen.py @@ -0,0 +1,72 @@ +ofile = open("./parfor_dump.cs", "w") + +cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight", + "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", 
"fifteen", "sixteen"] +ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", + "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"] + +letters = ["T", "U", "V", "W", "X", "Y", "Z", + "A", "B", "C", "D", "E", "F", "G", "H", "I"] + +for i in range(0, 16): + funcstr = "" + + funcstr += """/// +/// Creates a GPU parallel for loop. +/// The body of the kernel is run on a GPU target. +/// This overload specifies that {c} arrays are used on the GPU. +/// +/// The start of the loop, inclusive. +/// The end of the loop, exclusive.""".format(c=cardinals[i]) + + for j in range(i + 1): + adjusted = j + 1 + + funcstr += """ +/// The {o} buffer to run the kernel with.""".format(a=j + 1, o=ordinals[j]) + + funcstr += """ +/// The kernel to run on the GPU.""" + + for j in range(i + 1): + funcstr += """ +/// The base type of the {o} argument. Must be an unmanaged type.""".format(l=letters[j], o=ordinals[j]) + + funcstr += """ +public static void ParallelFor<""" + + for j in range(i): + funcstr += "{l}, ".format(l=letters[j]) + + funcstr += "{l}>(int start, int end, ".format(l=letters[i]) + + for j in range(i + 1): + adjusted = j + 1 + funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=adjusted) + + funcstr += "Action Date: Fri, 10 Nov 2023 05:29:56 -0600 Subject: [PATCH 10/61] update python to only generate up to 13 data params --- DotMP/GPU/Python/dispatch_gen.py | 31 ++++++++++++++++++------------- DotMP/GPU/Python/parfor_gen.py | 2 +- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/DotMP/GPU/Python/dispatch_gen.py b/DotMP/GPU/Python/dispatch_gen.py index f001dc9b..a9d0f08d 100644 --- a/DotMP/GPU/Python/dispatch_gen.py +++ b/DotMP/GPU/Python/dispatch_gen.py @@ -1,4 +1,4 @@ -ofile = open("./parfor_dump.cs", "w") +ofile = open("./dispatch_dump.cs", "w") cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", 
"fourteen", "fifteen", "sixteen"] @@ -8,13 +8,11 @@ letters = ["T", "U", "V", "W", "X", "Y", "Z", "A", "B", "C", "D", "E", "F", "G", "H", "I"] -for i in range(0, 16): +for i in range(0, 13): funcstr = "" funcstr += """/// -/// Creates a GPU parallel for loop. -/// The body of the kernel is run on a GPU target. -/// This overload specifies that {c} arrays are used on the GPU. +/// Dispatches a kernel with {c} parameters. /// /// The start of the loop, inclusive. /// The end of the loop, exclusive.""".format(c=cardinals[i]) @@ -33,7 +31,7 @@ /// The base type of the {o} argument. Must be an unmanaged type.""".format(l=letters[j], o=ordinals[j]) funcstr += """ -public static void ParallelFor<""" +internal void DispatchKernel<""" for j in range(i): funcstr += "{l}, ".format(l=letters[j]) @@ -57,16 +55,23 @@ funcstr += """ { - var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, """ + var idx = new Index(); - for j in range(i + 1): + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, +""" + + for j in range(i): adjusted = j + 1 - funcstr += "buf{a}, ".format(a=adjusted) + funcstr += """ new GPUArray<{l}>(buf{a}.View), +""".format(l=letters[j], a=adjusted) - funcstr += """action); -} + funcstr += """ new GPUArray<{l}>(buf{a}.View)); -""" + Synchronize(); +""".format(l=letters[i], a=i + 1) + + funcstr += "}\n\n" ofile.write(funcstr) diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py index f001dc9b..c119b624 100644 --- a/DotMP/GPU/Python/parfor_gen.py +++ b/DotMP/GPU/Python/parfor_gen.py @@ -8,7 +8,7 @@ letters = ["T", "U", "V", "W", "X", "Y", "Z", "A", "B", "C", "D", "E", "F", "G", "H", "I"] -for i in range(0, 16): +for i in range(0, 13): funcstr = "" funcstr += """/// From 7b9093336eba6c3de29142ef6e4ae1ec9f8dc695 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 05:30:10 -0600 Subject: [PATCH 11/61] add parfor overloads for up to 13 data parameters --- 
DotMP/GPU/AcceleratorHandler.cs | 465 +++++++++++++++++++++++++++++--- DotMP/GPU/Gpu.cs | 177 ------------ 2 files changed, 432 insertions(+), 210 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs index 220d57bd..579bb127 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -47,14 +47,14 @@ internal AcceleratorHandler() private void Synchronize() => accelerator.Synchronize(); /// - /// Dispatches a kernel with one data parameter. + /// Dispatches a kernel with one parameters. /// - /// The type of the data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. - /// The buffer to run the kernel with. - /// The action to perform. - internal void DispatchKernel(int start, int end, Buffer buf, Action> action) + /// The first buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Action> action) where T : unmanaged { var idx = new Index(); @@ -62,21 +62,21 @@ internal void DispatchKernel(int start, int end, Buffer buf, Action(buf.View)); + new GPUArray(buf1.View)); Synchronize(); } /// - /// Dispatches a kernel with two data parameters. + /// Dispatches a kernel with two parameters. /// - /// The type of the first data parameter. - /// The type of the second data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. /// The first buffer to run the kernel with. /// The second buffer to run the kernel with. - /// The action to perform. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. 
internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action) where T : unmanaged where U : unmanaged @@ -93,17 +93,17 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer } /// - /// Dispatches a kernel with three data parameters. + /// Dispatches a kernel with three parameters. /// - /// The type of the first data parameter. - /// The type of the second data parameter. - /// The type of the third data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. /// The first buffer to run the kernel with. /// The second buffer to run the kernel with. /// The third buffer to run the kernel with. - /// The action to perform. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged @@ -124,17 +124,17 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer /// /// Dispatches a kernel with four parameters. /// - /// The type of the first data parameter. - /// The type of the second data parameter. - /// The type of the third data parameter. - /// The type of the fourth data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. /// The first buffer to run the kernel with. /// The second buffer to run the kernel with. /// The third buffer to run the kernel with. /// The fourth buffer to run the kernel with. - /// The action to perform. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. 
+ /// The base type of the fourth argument. Must be an unmanaged type. internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged @@ -157,11 +157,6 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buf /// /// Dispatches a kernel with five parameters. /// - /// The type of the first data parameter. - /// The type of the second data parameter. - /// The type of the third data parameter. - /// The type of the fourth data parameter. - /// The type of the fifth data parameter. /// The start of the loop, inclusive. /// The end of the loop, exclusive. /// The first buffer to run the kernel with. @@ -169,7 +164,12 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buf /// The third buffer to run the kernel with. /// The fourth buffer to run the kernel with. /// The fifth buffer to run the kernel with. - /// The action to perform. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged @@ -194,12 +194,6 @@ internal void DispatchKernel(int start, int end, Buffer buf1, /// /// Dispatches a kernel with six parameters. /// - /// The type of the first data parameter. - /// The type of the second data parameter. - /// The type of the third data parameter. - /// The type of the fourth data parameter. - /// The type of the fifth data parameter. - /// The type of the sixth data parameter. 
/// The start of the loop, inclusive. /// The end of the loop, exclusive. /// The first buffer to run the kernel with. @@ -208,7 +202,13 @@ internal void DispatchKernel(int start, int end, Buffer buf1, /// The fourth buffer to run the kernel with. /// The fifth buffer to run the kernel with. /// The sixth buffer to run the kernel with. - /// The action to perform. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) where T : unmanaged where U : unmanaged @@ -231,5 +231,404 @@ internal void DispatchKernel(int start, int end, Buffer buf Synchronize(); } + + /// + /// Dispatches a kernel with seven parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. 
Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View), + new GPUArray(buf7.View)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with eight parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. 
Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View), + new GPUArray(buf7.View), + new GPUArray(buf8.View)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with nine parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. 
+ /// The base type of the ninth argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View), + new GPUArray(buf7.View), + new GPUArray(buf8.View), + new GPUArray(buf9.View)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with ten parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. 
Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View), + new GPUArray(buf7.View), + new GPUArray(buf8.View), + new GPUArray(buf9.View), + new GPUArray(buf10.View)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with eleven parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The kernel to run on the GPU. 
+ /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View), + new GPUArray(buf7.View), + new GPUArray(buf8.View), + new GPUArray(buf9.View), + new GPUArray(buf10.View), + new GPUArray(buf11.View)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with twelve parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. 
+ /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The twelfth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + /// The base type of the twelfth argument. Must be an unmanaged type. 
+ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + where E : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View), + new GPUArray(buf7.View), + new GPUArray(buf8.View), + new GPUArray(buf9.View), + new GPUArray(buf10.View), + new GPUArray(buf11.View), + new GPUArray(buf12.View)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with thirteen parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The seventh buffer to run the kernel with. + /// The eighth buffer to run the kernel with. + /// The ninth buffer to run the kernel with. + /// The tenth buffer to run the kernel with. + /// The eleventh buffer to run the kernel with. + /// The twelfth buffer to run the kernel with. + /// The thirteenth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The base type of the first argument. Must be an unmanaged type. 
+ /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. + /// The base type of the seventh argument. Must be an unmanaged type. + /// The base type of the eighth argument. Must be an unmanaged type. + /// The base type of the ninth argument. Must be an unmanaged type. + /// The base type of the tenth argument. Must be an unmanaged type. + /// The base type of the eleventh argument. Must be an unmanaged type. + /// The base type of the twelfth argument. Must be an unmanaged type. + /// The base type of the thirteenth argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + where Y : unmanaged + where Z : unmanaged + where A : unmanaged + where B : unmanaged + where C : unmanaged + where D : unmanaged + where E : unmanaged + where F : unmanaged + { + var idx = new Index(); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1.View), + new GPUArray(buf2.View), + new GPUArray(buf3.View), + new GPUArray(buf4.View), + new GPUArray(buf5.View), + new GPUArray(buf6.View), + new GPUArray(buf7.View), + new GPUArray(buf8.View), + new GPUArray(buf9.View), + new GPUArray(buf10.View), + new GPUArray(buf11.View), + new GPUArray(buf12.View), + new GPUArray(buf13.View)); 
+ + Synchronize(); + } } } \ No newline at end of file diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs index 624c6033..1237a75d 100644 --- a/DotMP/GPU/Gpu.cs +++ b/DotMP/GPU/Gpu.cs @@ -464,182 +464,5 @@ public static void ParallelFor(int start, var handler = new AcceleratorHandler(); handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action); } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that fourteen arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The thirteenth buffer to run the kernel with. - /// The fourteenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. 
Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. - /// The base type of the thirteenth argument. Must be an unmanaged type. - /// The base type of the fourteenth argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Buffer buf14, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - where F : unmanaged - where G : unmanaged - { - var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, action); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that fifteen arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. 
- /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The thirteenth buffer to run the kernel with. - /// The fourteenth buffer to run the kernel with. - /// The fifteenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. - /// The base type of the thirteenth argument. Must be an unmanaged type. - /// The base type of the fourteenth argument. Must be an unmanaged type. - /// The base type of the fifteenth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Buffer buf14, Buffer buf15, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - where F : unmanaged - where G : unmanaged - where H : unmanaged - { - var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, action); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that sixteen arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The thirteenth buffer to run the kernel with. - /// The fourteenth buffer to run the kernel with. - /// The fifteenth buffer to run the kernel with. - /// The sixteenth buffer to run the kernel with. 
- /// The kernel to run on the GPU. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. - /// The base type of the thirteenth argument. Must be an unmanaged type. - /// The base type of the fourteenth argument. Must be an unmanaged type. - /// The base type of the fifteenth argument. Must be an unmanaged type. - /// The base type of the sixteenth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Buffer buf14, Buffer buf15, Buffer buf16, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - where F : unmanaged - where G : unmanaged - where H : unmanaged - where I : unmanaged - { - var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, action); - } } } \ No newline at end of file From 8a87b5c4b511a0893b5a590aff2a4732311d6085 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 05:35:18 -0600 Subject: [PATCH 12/61] fix cardinality in documentation --- DotMP/GPU/AcceleratorHandler.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs index 579bb127..05ebe6b9 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -47,14 +47,14 @@ internal AcceleratorHandler() private void Synchronize() => accelerator.Synchronize(); /// - /// Dispatches a kernel with one parameters. + /// Dispatches a kernel with one parameter. /// /// The start of the loop, inclusive. /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. + /// The buffer to run the kernel with. /// The kernel to run on the GPU. /// The base type of the first argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Action> action) + internal void DispatchKernel(int start, int end, Buffer buf, Action> action) where T : unmanaged { var idx = new Index(); @@ -62,7 +62,7 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Action(buf1.View)); + new GPUArray(buf.View)); Synchronize(); } From ef3035f33363eb24934097e06af9a7b2b3cba9fb Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 08:54:35 -0600 Subject: [PATCH 13/61] implement 2D arrays into GPUArray and Buffer objects --- DotMP/GPU/AcceleratorHandler.cs | 188 ++++++++++++++++---------------- DotMP/GPU/Buffer.cs | 104 +++++++++++++++--- DotMP/GPU/GpuArray.cs | 63 +++++++++-- 3 files changed, 243 insertions(+), 112 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs index 05ebe6b9..95d415ff 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -35,7 +35,11 @@ internal AcceleratorHandler() if (initialized) return; context = Context.CreateDefault(); - accelerator = context.Devices[0].CreateAccelerator(context); + accelerator = context.Devices[1].CreateAccelerator(context); + foreach (var d in context.Devices) + { + Console.WriteLine("Detected {0} accelerator.", d.ToString()); + } Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); initialized = true; block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 
16 : 256; @@ -62,7 +66,7 @@ internal void DispatchKernel(int start, int end, Buffer buf, Action(buf.View)); + new GPUArray(buf)); Synchronize(); } @@ -86,8 +90,8 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View)); + new GPUArray(buf1), + new GPUArray(buf2)); Synchronize(); } @@ -114,9 +118,9 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3)); Synchronize(); } @@ -146,10 +150,10 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buf var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4)); Synchronize(); } @@ -182,11 +186,11 @@ internal void DispatchKernel(int start, int end, Buffer buf1, var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5)); Synchronize(); } @@ -222,12 +226,12 @@ internal void DispatchKernel(int start, int end, Buffer buf var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), 
- new GPUArray(buf6.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), + new GPUArray(buf6)); Synchronize(); } @@ -266,13 +270,13 @@ internal void DispatchKernel(int start, int end, Buffer var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), - new GPUArray(buf6.View), - new GPUArray(buf7.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), + new GPUArray(buf6), + new GPUArray(buf7)); Synchronize(); } @@ -314,14 +318,14 @@ internal void DispatchKernel(int start, int end, Buffer< var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), - new GPUArray(buf6.View), - new GPUArray(buf7.View), - new GPUArray(buf8.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), + new GPUArray(buf6), + new GPUArray(buf7), + new GPUArray(buf8)); Synchronize(); } @@ -366,15 +370,15 @@ internal void DispatchKernel(int start, int end, Buff var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), - new GPUArray(buf6.View), - new GPUArray(buf7.View), - new GPUArray(buf8.View), - new GPUArray(buf9.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), + new GPUArray(buf6), + new GPUArray(buf7), + new GPUArray(buf8), + new GPUArray(buf9)); Synchronize(); } @@ -422,16 +426,16 @@ internal void 
DispatchKernel(int start, int end, B var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), - new GPUArray(buf6.View), - new GPUArray(buf7.View), - new GPUArray(buf8.View), - new GPUArray(buf9.View), - new GPUArray(buf10.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), + new GPUArray(buf6), + new GPUArray(buf7), + new GPUArray(buf8), + new GPUArray(buf9), + new GPUArray(buf10)); Synchronize(); } @@ -482,17 +486,17 @@ internal void DispatchKernel(int start, int end var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), - new GPUArray(buf6.View), - new GPUArray(buf7.View), - new GPUArray(buf8.View), - new GPUArray(buf9.View), - new GPUArray(buf10.View), - new GPUArray(buf11.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), + new GPUArray(buf6), + new GPUArray(buf7), + new GPUArray(buf8), + new GPUArray(buf9), + new GPUArray(buf10), + new GPUArray(buf11)); Synchronize(); } @@ -546,18 +550,18 @@ internal void DispatchKernel(int start, int var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), - new GPUArray(buf6.View), - new GPUArray(buf7.View), - new GPUArray(buf8.View), - new GPUArray(buf9.View), - new GPUArray(buf10.View), - new GPUArray(buf11.View), - new GPUArray(buf12.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), 
+ new GPUArray(buf6), + new GPUArray(buf7), + new GPUArray(buf8), + new GPUArray(buf9), + new GPUArray(buf10), + new GPUArray(buf11), + new GPUArray(buf12)); Synchronize(); } @@ -614,19 +618,19 @@ internal void DispatchKernel(int start, i var kernel = accelerator.LoadStreamKernel(action); kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1.View), - new GPUArray(buf2.View), - new GPUArray(buf3.View), - new GPUArray(buf4.View), - new GPUArray(buf5.View), - new GPUArray(buf6.View), - new GPUArray(buf7.View), - new GPUArray(buf8.View), - new GPUArray(buf9.View), - new GPUArray(buf10.View), - new GPUArray(buf11.View), - new GPUArray(buf12.View), - new GPUArray(buf13.View)); + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5), + new GPUArray(buf6), + new GPUArray(buf7), + new GPUArray(buf8), + new GPUArray(buf9), + new GPUArray(buf10), + new GPUArray(buf11), + new GPUArray(buf12), + new GPUArray(buf13)); Synchronize(); } diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs index e0bd6908..2ec5b33c 100644 --- a/DotMP/GPU/Buffer.cs +++ b/DotMP/GPU/Buffer.cs @@ -1,5 +1,5 @@ using System; -using DotMP.GPU; +using ILGPU; using ILGPU.Runtime; namespace DotMP.GPU @@ -32,11 +32,15 @@ public enum Behavior public class Buffer : IDisposable where T : unmanaged { + /// + /// The ILGPU buffer for 1D arrays. + /// + private MemoryBuffer1D buf1d; /// - /// The ILGPU buffer. + /// The ILGPU buffer for 2D arrays. /// - private MemoryBuffer1D buf; + private MemoryBuffer2D buf2d; /// /// Behavior of the data, as specified by Behavior. @@ -44,12 +48,41 @@ public class Buffer : IDisposable private Buffer.Behavior behavior; /// - /// The CPU array, so that we can copy the data back. + /// The CPU 1D array, so that we can copy the data back. + /// + private T[] data1d; + + /// + /// The CPU 2D array, so that we can copy the data back. 
+ /// + private T[,] data2d; + + /// + /// Handler int for the number of dimensions in the array. + /// + private int dims; + + /// + /// The number of dimensions in the array. /// - private T[] data; + internal int Dimensions + { + get + { + return dims; + } + + private set + { + if (value < 1 || value > 3) + throw new ArgumentOutOfRangeException("Number of dimensions must be between 1 and 3."); + + dims = value; + } + } /// - /// Constructor for buffer object. Allocates data on the GPU and makes it available for the next GPU kernel. + /// Constructor for buffer object. Allocates a 1D array on the GPU and makes it available for the next GPU kernel. /// /// The data to allocate on the GPU. /// The behavior of the data, see Behavior. @@ -58,18 +91,46 @@ public Buffer(T[] data, Buffer.Behavior behavior) new AcceleratorHandler(); this.behavior = behavior; - this.data = data; + this.data1d = data; switch (behavior) { case Buffer.Behavior.To: case Buffer.Behavior.ToFrom: - buf = AcceleratorHandler.accelerator.Allocate1D(data); + buf1d = AcceleratorHandler.accelerator.Allocate1D(data); break; case Buffer.Behavior.From: - buf = AcceleratorHandler.accelerator.Allocate1D(data.Length); + buf1d = AcceleratorHandler.accelerator.Allocate1D(data.Length); break; } + + Dimensions = 1; + } + + /// + /// Constructor for buffer object. Allocates a 2D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. 
+ public Buffer(T[,] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data2d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY(data); + break; + case Buffer.Behavior.From: + buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY((data.GetLength(0), data.GetLength(1))); + break; + } + + Dimensions = 2; } /// @@ -77,17 +138,34 @@ public Buffer(T[] data, Buffer.Behavior behavior) /// public void Dispose() { - if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + if (Dimensions == 1) { - buf.GetAsArray1D().CopyTo(data, 0); + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + buf1d.GetAsArray1D().CopyTo(data1d, 0); + } + + buf1d.Dispose(); } + else if (Dimensions == 2) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + buf2d.GetAsArray2D().CopyTo(data2d, 0); + } - buf.Dispose(); + buf2d.Dispose(); + } } /// /// Get the view of the memory for the GPU. /// - internal ArrayView1D View { get => buf.View; } + internal ArrayView1D View1D { get => buf1d.View; } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView2D View2D { get => buf2d.View; } } } \ No newline at end of file diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index 55c45edb..ffc898ef 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -1,4 +1,5 @@ using ILGPU; +using ILGPU.Runtime; using System; namespace DotMP.GPU @@ -11,17 +12,43 @@ public struct GPUArray where T : unmanaged { /// - /// Internal ArrayView object. + /// The ILGPU buffer for 1D arrays. /// - private ArrayView arrayView; + private ArrayView1D view1d; + + /// + /// The ILGPU buffer for 2D arrays. + /// + private ArrayView2D view2d; + + /// + /// Number of dimensions. + /// + private int dims; /// /// Constructor. /// /// The ArrayView to wrap. 
- public GPUArray(ArrayView arrayView) + public GPUArray(Buffer arrayView) { - this.arrayView = arrayView; + if (arrayView.Dimensions == 1) + { + view1d = arrayView.View1D; + view2d = ArrayView2D.Empty; + } + else if (arrayView.Dimensions == 2) + { + view1d = ArrayView1D.Empty; + view2d = arrayView.View2D; + } + else + { + view1d = ArrayView1D.Empty; + view2d = ArrayView2D.Empty; + } + + dims = arrayView.Dimensions; } /// @@ -31,8 +58,20 @@ public GPUArray(ArrayView arrayView) /// The data at that ID. public T this[int idx] { - get => arrayView[idx]; - set => arrayView[idx] = value; + get => view1d[idx]; + set => view1d[idx] = value; + } + + /// + /// Overload for [,] operator. + /// + /// The first ID to index into. + /// The second ID to index into. + /// The data at that ID. + public T this[int i, int j] + { + get => view2d[i, j]; + set => view2d[i, j] = value; } /// @@ -40,7 +79,17 @@ public T this[int idx] /// public int Length { - get => arrayView.IntLength; + get + { + switch (dims) + { + case 1: + default: + return view1d.IntLength; + case 2: + return view2d.IntLength; + } + } } } } \ No newline at end of file From 4eb444543392df07778564b72bcf1316b858b226 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 08:54:51 -0600 Subject: [PATCH 14/61] initial commit of GPU heat transfer benchmark --- .../GPUHeatTransfer/GPUHeatTransfer.csproj | 10 + benchmarks/GPUHeatTransfer/Program.cs | 302 ++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj create mode 100644 benchmarks/GPUHeatTransfer/Program.cs diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj new file mode 100644 index 00000000..d4398000 --- /dev/null +++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj @@ -0,0 +1,10 @@ + + + + Exe + net7.0 + enable + enable + + + diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs new file 
mode 100644 index 00000000..3889f953 --- /dev/null +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -0,0 +1,302 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Running; +using BenchmarkDotNet.Diagnosers; + +/* jscpd:ignore-start */ + +[SimpleJob(RuntimeMoniker.Net60)] +[ThreadingDiagnoser] +[HardwareCounters] +[EventPipeProfiler(EventPipeProfile.CpuSampling)] +// test heat transfer using Parallel.For +public class HeatTransfer +{ + // scratch array + private double[,] scratch = new double[0, 0]; + // grid array + private double[,] grid = new double[0, 0]; + + //private + + // parallel type enum + public enum ParType { DMPFor, DMPGPU } + + // test dims of 100x100, 1000x1000, and 5000x5000 + [Params(500)] + public int dim; + + // test with 10 steps and 100 steps + [Params(100)] + public int steps; + + // test with all 3 parallel types + [Params(ParType.DMPFor, ParType.DMPGPU)] + public ParType type; + + // change this to configure the number of threads to use + public uint num_threads = 6; + + // run the setup + [GlobalSetup] + public void Setup() + { + 
scratch = new double[dim, dim]; + grid = new double[dim, dim]; + + grid[0, dim / 2 - 1] = 100.0; + grid[0, dim / 2] = 100.0; + } + + //run the simulation + [Benchmark] + public void DoSimulation() + { + Action action = () => + { + //do the steps + for (int i = 0; i < steps; i++) + { + DoStep(); + } + }; + + if (type == ParType.DMPGPU) + { + action(); + } + else + { + // spawn a parallel region + DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action); + } + } + + //do a step of the heat transfer simulation + public void DoStep() + { + switch (type) + { + case ParType.DMPFor: + //iterate over all cells not on the border + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + }); + + //copy the scratch array to the grid array + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + }); + break; + + case ParType.DMPGPU: + DotMP.GPU.ParallelFor(); + break; + } + } +} + +// test heat transfer using Parallel.For +public class HeatTransferVerify +{ + // scratch array + private double[,] scratch = new double[0, 0]; + // grid array + private double[,] grid = new double[0, 0]; + + // parallel type enum + public enum ParType { TPL, For, ForCollapse, Serial } + + // test dims of 100x100, 1000x1000, and 5000x5000 + public int dim = 500; + + // test with 10 steps and 100 steps + public int steps = 100; + + // test with all 3 parallel types + public ParType type = ParType.For; + + // change this to configure the number of threads to use + public uint num_threads = 6; + + // run the setup + public void Setup() + { + scratch = new double[dim, dim]; + grid = new double[dim, dim]; + + grid[0, dim / 2 - 1] = 100.0; + grid[0, dim / 2] = 
100.0; + } + + //run the simulation + public void DoSimulation() + { + Action action = () => + { + //do the steps + for (int i = 0; i < steps; i++) + { + DoStep(); + } + }; + + if (type == ParType.TPL || type == ParType.Serial) + { + action(); + } + else + { + // spawn a parallel region + DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action); + } + } + + //do a step of the heat transfer simulation + public void DoStep() + { + switch (type) + { + case ParType.TPL: + //iterate over all cells not on the border + System.Threading.Tasks.Parallel.For(1, dim - 1, i => + { + System.Threading.Tasks.Parallel.For(1, dim - 1, j => + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + }); + }); + + //copy the scratch array to the grid array + System.Threading.Tasks.Parallel.For(1, dim - 1, i => + { + System.Threading.Tasks.Parallel.For(1, dim - 1, j => + { + grid[i, j] = scratch[i, j]; + }); + }); + break; + + case ParType.For: + //iterate over all cells not on the border + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + }); + + //copy the scratch array to the grid array + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + }); + break; + + case ParType.ForCollapse: + //iterate over all cells not on the border + DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) => + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + }); + + //copy the scratch array 
to the grid array + DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) => + { + grid[i, j] = scratch[i, j]; + }); + break; + + case ParType.Serial: + for (int i = 1; i < dim - 1; i++) + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + } + + //copy the scratch array to the grid array + for (int i = 1; i < dim - 1; i++) + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + } + break; + } + } + + public void Verify() + { + Setup(); + type = ParType.For; + DoSimulation(); + double[,] gridA = grid; + + Setup(); + type = ParType.Serial; + DoSimulation(); + double[,] gridB = grid; + + bool wrong = false; + + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + if (gridA[i, j] != gridB[i, j]) + wrong = true; + + if (wrong) + Console.WriteLine("WRONG RESULT"); + else + Console.WriteLine("RIGHT RESULT"); + } +} + +/* jscpd:ignore-end */ + +// driver +public class Program +{ + public static void Main(string[] args) + { + if (args.Length > 0 && args[0] == "verify") + new HeatTransferVerify().Verify(); + else + BenchmarkRunner.Run(); + } +} \ No newline at end of file From b5365b787a45d2fb2fba7fc5ebb6cdf29840e2f5 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 10:37:06 -0600 Subject: [PATCH 15/61] add nocopy behavior --- DotMP/GPU/Buffer.cs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs index 2ec5b33c..63e35e8e 100644 --- a/DotMP/GPU/Buffer.cs +++ b/DotMP/GPU/Buffer.cs @@ -22,7 +22,11 @@ public enum Behavior /// /// Specifies that data should be transfered both to and from the GPU. /// - ToFrom + ToFrom, + /// + /// Specifies that the data shouldn't be transfered to or from the GPU. For internal use. 
+ /// + NoCopy } } @@ -100,6 +104,7 @@ public Buffer(T[] data, Buffer.Behavior behavior) buf1d = AcceleratorHandler.accelerator.Allocate1D(data); break; case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: buf1d = AcceleratorHandler.accelerator.Allocate1D(data.Length); break; } @@ -126,6 +131,7 @@ public Buffer(T[,] data, Buffer.Behavior behavior) buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY(data); break; case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY((data.GetLength(0), data.GetLength(1))); break; } From c092eb94002decb48d389dc244bf59bf7461e7ca Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 10:37:22 -0600 Subject: [PATCH 16/61] fix exception on OpenCL devices --- DotMP/GPU/GpuArray.cs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index ffc898ef..5be53869 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -35,17 +35,21 @@ public GPUArray(Buffer arrayView) if (arrayView.Dimensions == 1) { view1d = arrayView.View1D; - view2d = ArrayView2D.Empty; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; } else if (arrayView.Dimensions == 2) { - view1d = ArrayView1D.Empty; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; view2d = arrayView.View2D; } else { - view1d = ArrayView1D.Empty; - view2d = ArrayView2D.Empty; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. 
+ view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; } dims = arrayView.Dimensions; From f66b22795b74f4e60c6221925c571c4c410c6a2b Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 11:44:40 -0600 Subject: [PATCH 17/61] better accelerator selection --- DotMP/GPU/AcceleratorHandler.cs | 37 ++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs index 95d415ff..d369489b 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -35,11 +35,20 @@ internal AcceleratorHandler() if (initialized) return; context = Context.CreateDefault(); - accelerator = context.Devices[1].CreateAccelerator(context); + var selectedDevice = context.Devices[0]; + foreach (var d in context.Devices) { Console.WriteLine("Detected {0} accelerator.", d.ToString()); + + if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL) + selectedDevice = d; + if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda) + selectedDevice = d; } + + accelerator = selectedDevice.CreateAccelerator(context); + Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); initialized = true; block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 
16 : 256; @@ -61,7 +70,7 @@ internal AcceleratorHandler() internal void DispatchKernel(int start, int end, Buffer buf, Action> action) where T : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -85,7 +94,7 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer where T : unmanaged where U : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -113,7 +122,7 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer where U : unmanaged where V : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -145,7 +154,7 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buf where V : unmanaged where W : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -181,7 +190,7 @@ internal void DispatchKernel(int start, int end, Buffer buf1, where W : unmanaged where X : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -221,7 +230,7 @@ internal void DispatchKernel(int start, int end, Buffer buf where X : unmanaged where Y : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -265,7 +274,7 @@ internal void DispatchKernel(int start, int end, Buffer where Y : unmanaged where Z : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -313,7 +322,7 @@ internal void DispatchKernel(int start, int end, Buffer< where Z : unmanaged where A : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -365,7 +374,7 @@ internal void DispatchKernel(int start, int end, Buff where A : unmanaged where B : unmanaged { - var 
idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -421,7 +430,7 @@ internal void DispatchKernel(int start, int end, B where B : unmanaged where C : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -481,7 +490,7 @@ internal void DispatchKernel(int start, int end where C : unmanaged where D : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -545,7 +554,7 @@ internal void DispatchKernel(int start, int where D : unmanaged where E : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); @@ -613,7 +622,7 @@ internal void DispatchKernel(int start, i where E : unmanaged where F : unmanaged { - var idx = new Index(); + var idx = new Index(start); var kernel = accelerator.LoadStreamKernel(action); From 79c4a357ac1ba96fcbfd407cd04eaaad43b7bee0 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 11:45:11 -0600 Subject: [PATCH 18/61] fix copying back 2D arrays --- DotMP/GPU/Buffer.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs index 63e35e8e..253f4968 100644 --- a/DotMP/GPU/Buffer.cs +++ b/DotMP/GPU/Buffer.cs @@ -1,4 +1,5 @@ using System; +using System.Runtime.CompilerServices; using ILGPU; using ILGPU.Runtime; @@ -157,7 +158,7 @@ public void Dispose() { if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) { - buf2d.GetAsArray2D().CopyTo(data2d, 0); + System.Buffer.BlockCopy(buf2d.GetAsArray2D(), 0, data2d, 0, Unsafe.SizeOf() * data2d.Length); } buf2d.Dispose(); From 528cba09f8b895346c2ccd23ab8957f3a1c979ba Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 11:46:13 -0600 Subject: [PATCH 19/61] added start offset for index calculations --- DotMP/GPU/Handle.cs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 
deletion(-) diff --git a/DotMP/GPU/Handle.cs b/DotMP/GPU/Handle.cs index c90f773f..a67f41a2 100644 --- a/DotMP/GPU/Handle.cs +++ b/DotMP/GPU/Handle.cs @@ -8,13 +8,27 @@ namespace DotMP.GPU /// public struct Index { + /// + /// The start of the for loop, for index calculations. + /// + private int start; + + /// + /// Constructor. + /// + /// The start of the parallel for loop. + internal Index(int start) + { + this.start = start; + } + /// /// Gets the index of the loop. /// /// Unused. public static implicit operator int(Index h) { - return Grid.GlobalIndex.X; + return Grid.GlobalIndex.X + h.start; } } } \ No newline at end of file From 6121c0dfbac71ed4feca00d4b5b8cb3a53ca4bc5 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 11:46:30 -0600 Subject: [PATCH 20/61] get HeatTransferVerify running properly --- .../GPUHeatTransfer/GPUHeatTransfer.csproj | 8 ++ benchmarks/GPUHeatTransfer/Program.cs | 108 +++++++++--------- 2 files changed, 64 insertions(+), 52 deletions(-) diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj index d4398000..41e8c54b 100644 --- a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj +++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj @@ -7,4 +7,12 @@ enable + + + + + + + + diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs index 3889f953..6d4be932 100644 --- a/benchmarks/GPUHeatTransfer/Program.cs +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -53,6 +53,12 @@ public enum ParType { DMPFor, DMPGPU } // change this to configure the number of threads to use public uint num_threads = 6; + // buffer for grid + private DotMP.GPU.Buffer gridbuf; + + // buffer for scratch + private DotMP.GPU.Buffer scratchbuf; + // run the setup [GlobalSetup] public void Setup() @@ -62,6 +68,9 @@ public void Setup() grid[0, dim / 2 - 1] = 100.0; grid[0, dim / 2] = 100.0; + + gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.To); + scratchbuf 
= new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); } //run the simulation @@ -115,7 +124,22 @@ public void DoStep() break; case ParType.DMPGPU: - DotMP.GPU.ParallelFor(); + DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + }); + + DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + }); break; } } @@ -130,20 +154,26 @@ public class HeatTransferVerify private double[,] grid = new double[0, 0]; // parallel type enum - public enum ParType { TPL, For, ForCollapse, Serial } + public enum ParType { DMPFor, DMPGPU } // test dims of 100x100, 1000x1000, and 5000x5000 - public int dim = 500; + public int dim = 514; // test with 10 steps and 100 steps public int steps = 100; // test with all 3 parallel types - public ParType type = ParType.For; + public ParType type = ParType.DMPFor; // change this to configure the number of threads to use public uint num_threads = 6; + // buffer for grid + private DotMP.GPU.Buffer gridbuf; + + // buffer for scratch + private DotMP.GPU.Buffer scratchbuf; + // run the setup public void Setup() { @@ -152,6 +182,12 @@ public void Setup() grid[0, dim / 2 - 1] = 100.0; grid[0, dim / 2] = 100.0; + + if (type == ParType.DMPGPU) + { + gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.ToFrom); + scratchbuf = new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); + } } //run the simulation @@ -166,9 +202,11 @@ public void DoSimulation() } }; - if (type == ParType.TPL || type == ParType.Serial) + if (type == ParType.DMPGPU) { action(); + gridbuf.Dispose(); + scratchbuf.Dispose(); } else { @@ -182,28 +220,7 @@ public void DoStep() { switch (type) { - case 
ParType.TPL: - //iterate over all cells not on the border - System.Threading.Tasks.Parallel.For(1, dim - 1, i => - { - System.Threading.Tasks.Parallel.For(1, dim - 1, j => - { - //set the scratch array to the average of the surrounding cells - scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); - }); - }); - - //copy the scratch array to the grid array - System.Threading.Tasks.Parallel.For(1, dim - 1, i => - { - System.Threading.Tasks.Parallel.For(1, dim - 1, j => - { - grid[i, j] = scratch[i, j]; - }); - }); - break; - - case ParType.For: + case ParType.DMPFor: //iterate over all cells not on the border DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => { @@ -224,52 +241,36 @@ public void DoStep() }); break; - case ParType.ForCollapse: - //iterate over all cells not on the border - DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) => - { - //set the scratch array to the average of the surrounding cells - scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); - }); - - //copy the scratch array to the grid array - DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) => - { - grid[i, j] = scratch[i, j]; - }); - break; - - case ParType.Serial: - for (int i = 1; i < dim - 1; i++) + case ParType.DMPGPU: + DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => { - for (int j = 1; j < dim - 1; j++) + for (int j = 1; j < 514 - 1; j++) { //set the scratch array to the average of the surrounding cells scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); } - } + }); - //copy the scratch array to the grid array - for (int i = 1; i < dim - 1; i++) + DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => { - for (int j = 1; j < dim - 1; j++) + for (int j = 1; j < 514 - 1; j++) 
{ grid[i, j] = scratch[i, j]; } - } + }); break; } } public void Verify() { + type = ParType.DMPFor; Setup(); - type = ParType.For; DoSimulation(); double[,] gridA = grid; + type = ParType.DMPGPU; Setup(); - type = ParType.Serial; DoSimulation(); double[,] gridB = grid; @@ -278,7 +279,10 @@ public void Verify() for (int i = 0; i < dim; i++) for (int j = 0; j < dim; j++) if (gridA[i, j] != gridB[i, j]) + { wrong = true; + Console.WriteLine("Wrong at ({0}, {1}), expected {2}, got {3}.", i, j, gridA[i, j], gridB[i, j]); + } if (wrong) Console.WriteLine("WRONG RESULT"); From 62b5ebe515e8e18d3bfb7b87e4e3cd6272f8fc57 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 11:58:48 -0600 Subject: [PATCH 21/61] add LGPL license header --- DotMP/GPU/AcceleratorHandler.cs | 16 ++++++++++ DotMP/GPU/AssemblyAttributes.cs | 16 ++++++++++ DotMP/GPU/Buffer.cs | 16 ++++++++++ DotMP/GPU/Exceptions.cs | 39 ++++++++++--------------- DotMP/GPU/Gpu.cs | 16 ++++++++++ DotMP/GPU/GpuArray.cs | 16 ++++++++++ DotMP/GPU/Handle.cs | 34 ---------------------- DotMP/GPU/Index.cs | 50 ++++++++++++++++++++++++++++++++ DotMP/GPU/Python/dispatch_gen.py | 16 ++++++++++ DotMP/GPU/Python/parfor_gen.py | 16 ++++++++++ 10 files changed, 178 insertions(+), 57 deletions(-) delete mode 100644 DotMP/GPU/Handle.cs create mode 100644 DotMP/GPU/Index.cs diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs index d369489b..c7be93df 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -1,3 +1,19 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. 
+* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + using System; using System.Linq; using ILGPU; diff --git a/DotMP/GPU/AssemblyAttributes.cs b/DotMP/GPU/AssemblyAttributes.cs index a45c976b..7077a588 100644 --- a/DotMP/GPU/AssemblyAttributes.cs +++ b/DotMP/GPU/AssemblyAttributes.cs @@ -1,3 +1,19 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + using System.Runtime.CompilerServices; [assembly: InternalsVisibleTo("ILGPURuntime")] \ No newline at end of file diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs index 253f4968..87b756e3 100644 --- a/DotMP/GPU/Buffer.cs +++ b/DotMP/GPU/Buffer.cs @@ -1,3 +1,19 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + using System; using System.Runtime.CompilerServices; using ILGPU; diff --git a/DotMP/GPU/Exceptions.cs b/DotMP/GPU/Exceptions.cs index 58acddb5..4705041b 100644 --- a/DotMP/GPU/Exceptions.cs +++ b/DotMP/GPU/Exceptions.cs @@ -1,28 +1,21 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + using System; namespace DotMP.GPU { - /// - /// Exception thrown if too many or too few data movements were specified before a GPU kernel. - /// - public class WrongNumberOfDataMovementsSpecifiedException : Exception - { - /// - /// Constructor with a message. - /// - /// The message to associate with the exception. - public WrongNumberOfDataMovementsSpecifiedException(string msg) : base(msg) { } - } - - /// - /// Exception thrown if data movement is presented out-of-order. - /// - public class ImproperDataMovementOrderingException : Exception - { - /// - /// Constructor with a message. - /// - /// The message to associate with the exception. - public ImproperDataMovementOrderingException(string msg) : base(msg) { } - } } \ No newline at end of file diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs index 1237a75d..c9b8963c 100644 --- a/DotMP/GPU/Gpu.cs +++ b/DotMP/GPU/Gpu.cs @@ -1,3 +1,19 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + using System; using ILGPU; diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index 5be53869..fca82deb 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -1,3 +1,19 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + using ILGPU; using ILGPU.Runtime; using System; diff --git a/DotMP/GPU/Handle.cs b/DotMP/GPU/Handle.cs deleted file mode 100644 index a67f41a2..00000000 --- a/DotMP/GPU/Handle.cs +++ /dev/null @@ -1,34 +0,0 @@ -using ILGPU; -using System; - -namespace DotMP.GPU -{ - /// - /// Handle for a GPU kernel to retrieve its kernel variables. - /// - public struct Index - { - /// - /// The start of the for loop, for index calculations. - /// - private int start; - - /// - /// Constructor. - /// - /// The start of the parallel for loop. - internal Index(int start) - { - this.start = start; - } - - /// - /// Gets the index of the loop. - /// - /// Unused. 
- public static implicit operator int(Index h) - { - return Grid.GlobalIndex.X + h.start; - } - } -} \ No newline at end of file diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs new file mode 100644 index 00000000..03355c53 --- /dev/null +++ b/DotMP/GPU/Index.cs @@ -0,0 +1,50 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using ILGPU; +using System; + +namespace DotMP.GPU +{ + /// + /// Handle for a GPU kernel to retrieve its kernel variables. + /// + public struct Index + { + /// + /// The start of the for loop, for index calculations. + /// + private int start; + + /// + /// Constructor. + /// + /// The start of the parallel for loop. + internal Index(int start) + { + this.start = start; + } + + /// + /// Gets the index of the loop. + /// + /// Unused. 
+ public static implicit operator int(Index h) + { + return Grid.GlobalIndex.X + h.start; + } + } +} \ No newline at end of file diff --git a/DotMP/GPU/Python/dispatch_gen.py b/DotMP/GPU/Python/dispatch_gen.py index a9d0f08d..bb4152cd 100644 --- a/DotMP/GPU/Python/dispatch_gen.py +++ b/DotMP/GPU/Python/dispatch_gen.py @@ -1,3 +1,19 @@ +""" +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" + ofile = open("./dispatch_dump.cs", "w") cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight", diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py index c119b624..e960e861 100644 --- a/DotMP/GPU/Python/parfor_gen.py +++ b/DotMP/GPU/Python/parfor_gen.py @@ -1,3 +1,19 @@ +""" +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. 
+* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" + ofile = open("./parfor_dump.cs", "w") cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight", From d11913c78392968f4fd9c08b2619a095bac523a8 Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 11:59:18 -0600 Subject: [PATCH 22/61] prepare benchmark --- benchmarks/GPUHeatTransfer/Program.cs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs index 6d4be932..63c1bb61 100644 --- a/benchmarks/GPUHeatTransfer/Program.cs +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -33,13 +33,11 @@ public class HeatTransfer // grid array private double[,] grid = new double[0, 0]; - //private - // parallel type enum public enum ParType { DMPFor, DMPGPU } // test dims of 100x100, 1000x1000, and 5000x5000 - [Params(500)] + [Params(514)] public int dim; // test with 10 steps and 100 steps @@ -69,8 +67,11 @@ public void Setup() grid[0, dim / 2 - 1] = 100.0; grid[0, dim / 2] = 100.0; - gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.To); - scratchbuf = new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); + if (type == ParType.DMPGPU) + { + gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.ToFrom); + scratchbuf = new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); + } } //run the simulation @@ -89,6 +90,8 @@ public void DoSimulation() if (type == ParType.DMPGPU) { action(); + gridbuf.Dispose(); + scratchbuf.Dispose(); } else { @@ 
-126,7 +129,7 @@ public void DoStep() case ParType.DMPGPU: DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => { - for (int j = 1; j < dim - 1; j++) + for (int j = 1; j < 514 - 1; j++) { //set the scratch array to the average of the surrounding cells scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); @@ -135,7 +138,7 @@ public void DoStep() DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => { - for (int j = 1; j < dim - 1; j++) + for (int j = 1; j < 514 - 1; j++) { grid[i, j] = scratch[i, j]; } From 8b1242253b26e391a2d3ca60094006fd76979f2b Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 12:06:06 -0600 Subject: [PATCH 23/61] remove dispose for benchmarking --- benchmarks/GPUHeatTransfer/Program.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs index 63c1bb61..b3475816 100644 --- a/benchmarks/GPUHeatTransfer/Program.cs +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -90,8 +90,8 @@ public void DoSimulation() if (type == ParType.DMPGPU) { action(); - gridbuf.Dispose(); - scratchbuf.Dispose(); + //gridbuf.Dispose(); + //scratchbuf.Dispose(); } else { From 20eebd6767ceec8557836676766836cf7a44ea6b Mon Sep 17 00:00:00 2001 From: Lane Date: Fri, 10 Nov 2023 12:06:16 -0600 Subject: [PATCH 24/61] change to .net 6 --- benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj index 41e8c54b..9cf0a6f0 100644 --- a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj +++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj @@ -2,7 +2,7 @@ Exe - net7.0 + net6.0 enable enable From 708d9d564734c43b2ac47652b8fc58c4455a98fe Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 10:18:14 -0600 Subject: [PATCH 
25/61] add attributes to prevent exceptions when collecting code coverage --- DotMP/GPU/GpuArray.cs | 2 ++ DotMP/GPU/Index.cs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index fca82deb..a7d7d705 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -17,6 +17,7 @@ using ILGPU; using ILGPU.Runtime; using System; +using System.Diagnostics.CodeAnalysis; namespace DotMP.GPU { @@ -24,6 +25,7 @@ namespace DotMP.GPU /// Wrapper object for representing arrays on the GPU. /// /// + [ExcludeFromCodeCoverage] public struct GPUArray where T : unmanaged { diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index 03355c53..b1e9df75 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -16,12 +16,14 @@ using ILGPU; using System; +using System.Diagnostics.CodeAnalysis; namespace DotMP.GPU { /// /// Handle for a GPU kernel to retrieve its kernel variables. /// + [ExcludeFromCodeCoverage] public struct Index { /// From 2e328e709c975461c351e0fe0556af653d4b4380 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 10:18:23 -0600 Subject: [PATCH 26/61] fix bug --- DotMP/Init.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/DotMP/Init.cs b/DotMP/Init.cs index 49629f3f..83de8207 100644 --- a/DotMP/Init.cs +++ b/DotMP/Init.cs @@ -160,12 +160,14 @@ internal bool in_for { get { - if (in_for_pv == null) + int tid = Parallel.GetThreadNum(); + + if (in_for_pv == null || tid >= in_for_pv.Length) { return false; } - return in_for_pv[Parallel.GetThreadNum()]; + return in_for_pv[tid]; } set { From f3b7735caabf1f217f4210a19a54e5c2392234f0 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 10:18:50 -0600 Subject: [PATCH 27/61] mark old single/ordered/critical regions as obsolete, implement new versions --- DotMP/Parallel.cs | 167 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 159 insertions(+), 8 deletions(-) diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs index 
f5712643..d24189e3 100644 --- a/DotMP/Parallel.cs +++ b/DotMP/Parallel.cs @@ -1,5 +1,7 @@ using System; using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.ComponentModel; using System.Threading; namespace DotMP @@ -14,15 +16,15 @@ public static class Parallel /// /// The dictionary for critical regions. /// - private static volatile Dictionary critical_lock = new Dictionary(); + private static volatile Dictionary critical_lock = new Dictionary(); /// /// The dictionary for single regions. /// - private static volatile HashSet single_thread = new HashSet(); + private static volatile HashSet single_thread = new HashSet(); /// /// The dictionary for ordered regions. /// - private static volatile Dictionary ordered = new Dictionary(); + private static volatile Dictionary ordered = new Dictionary(); /// /// Barrier object for DotMP.Parallel.Barrier() /// @@ -106,6 +108,17 @@ private static void FixArgs(int start, int end, ref Schedule sched, ref uint? ch } } + /// + /// Formats the caller information for determining uniqueness of a call. + /// + /// The calling file. + /// The calling line number. + /// A formatted string representing "{filename}:{linenum}" + private static string FormatCaller(string filename, int linenum) + { + return string.Format("{0}:{1}", filename, linenum); + } + /// /// Creates a for loop inside a parallel region. /// A for loop created with For inside of a parallel region is executed in parallel, with iterations being distributed among the threads, and potentially out-of-order. @@ -950,11 +963,15 @@ public static void ParallelSections(uint? num_threads = null, params Action[] ac /// Creates a critical region. /// A critical region is a region of code that can only be executed by one thread at a time. /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished. + /// + /// THIS METHOD IS NOW DEPRECATED. 
/// /// The ID of the critical region. Must be unique per region but consistent across all threads. /// The action to be performed in the critical region. /// The ID of the critical region. /// Thrown when not in a parallel region. + [Obsolete("This version of Critical is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] public static int Critical(int id, Action action) { if (!InParallel()) @@ -964,6 +981,45 @@ public static int Critical(int id, Action action) object lock_obj; + lock (critical_lock) + { + if (!critical_lock.ContainsKey(id.ToString())) + { + critical_lock.Add(id.ToString(), new object()); + } + + lock_obj = critical_lock[id.ToString()]; + } + + lock (lock_obj) + { + action(); + } + + return id; + } + + /// + /// Creates a critical region. + /// A critical region is a region of code that can only be executed by one thread at a time. + /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished. + /// + /// The action to be performed in the critical region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// The ID of the critical region. + /// Thrown when not in a parallel region. + public static void Critical(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + + if (!InParallel()) + { + throw new NotInParallelRegionException("Cannot use DotMP Critical outside of a parallel region."); + } + + object lock_obj; + lock (critical_lock) { if (!critical_lock.ContainsKey(id)) @@ -978,8 +1034,6 @@ public static int Critical(int id, Action action) { action(); } - - return id; } /// @@ -1034,11 +1088,15 @@ public static void Master(Action action) /// Creates a single region. 
/// A single region is only executed once per Parallel.ParallelRegion. /// The first thread to encounter the single region marks the region as encountered, then executes it. + /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the single region. Must be unique per region but consistent across all threads. /// The action to be performed in the single region. /// Thrown when not in a parallel region. /// Thrown when nested inside another worksharing region. + [Obsolete("This version of Single is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] public static void Single(int id, Action action) { var freg = new ForkedRegion(); @@ -1058,6 +1116,55 @@ public static void Single(int id, Action action) Interlocked.Increment(ref freg.in_workshare); + lock (single_thread) + { + if (!single_thread.Contains(id.ToString())) + { + single_thread.Add(id.ToString()); + new_single = true; + } + } + + if (new_single) + { + action(); + } + + Interlocked.Decrement(ref freg.in_workshare); + + Barrier(); + } + + /// + /// Creates a single region. + /// A single region is only executed once per Parallel.ParallelRegion. + /// The first thread to encounter the single region marks the region as encountered, then executes it. + /// + /// The action to be performed in the single region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// Thrown when not in a parallel region. + /// Thrown when nested inside another worksharing region. 
+ public static void Single(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + var freg = new ForkedRegion(); + bool new_single = false; + + if (!freg.in_parallel) + { + throw new NotInParallelRegionException("Cannot use DotMP Single outside of a parallel region."); + } + + var ws = new WorkShare(); + + if (ws.in_for) + { + throw new CannotPerformNestedWorksharingException("Cannot use DotMP Single nested within other worksharing constructs."); + } + + Interlocked.Increment(ref freg.in_workshare); + lock (single_thread) { if (!single_thread.Contains(id)) @@ -1081,10 +1188,14 @@ public static void Single(int id, Action action) /// Creates an ordered region. /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction<T>() loop. /// This also acts as an implicit Critical() region. + /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the ordered region. Must be unique per region but consistent across all threads. /// The action to be performed in the ordered region. /// Thrown when not in a parallel region. + [Obsolete("This version of Ordered is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] public static void Ordered(int id, Action action) { var freg = new ForkedRegion(); @@ -1098,22 +1209,62 @@ public static void Ordered(int id, Action action) lock (ordered) { - if (!ordered.ContainsKey(id)) + if (!ordered.ContainsKey(id.ToString())) { - ordered.Add(id, 0); + ordered.Add(id.ToString(), 0); } Thread.MemoryBarrier(); } WorkShare ws = new WorkShare(); - while (ordered[id] != ws.thread.working_iter) + while (ordered[id.ToString()] != ws.thread.working_iter) { freg.reg.spin[tid].SpinOnce(); } action(); + lock (ordered) + { + ordered[id.ToString()]++; + } + } + + /// + /// Creates an ordered region. 
+ /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction<T>() loop. + /// This also acts as an implicit Critical() region. + /// + /// The action to be performed in the ordered region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// Thrown when not in a parallel region. + public static void Ordered(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + var freg = new ForkedRegion(); + + if (!freg.in_parallel) + { + throw new NotInParallelRegionException("Cannot use DotMP Ordered outside of a parallel region."); + } + + lock (ordered) + { + if (!ordered.ContainsKey(id)) + { + ordered.Add(id, 0); + } + Thread.MemoryBarrier(); + } + + WorkShare ws = new WorkShare(); + + while (ordered[id] != ws.thread.working_iter) ; + + action(); + lock (ordered) { ordered[id]++; From 429bde2b05aa7a2432c43c9500ce8f6e17d3251e Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 10:18:59 -0600 Subject: [PATCH 28/61] remove sln files --- examples/CSParallel/KNN/KNN.sln | 25 ------------------------- examples/DotMP/KNN/KNN.sln | 25 ------------------------- examples/Serial/KNN/KNN.sln | 25 ------------------------- 3 files changed, 75 deletions(-) delete mode 100644 examples/CSParallel/KNN/KNN.sln delete mode 100644 examples/DotMP/KNN/KNN.sln delete mode 100644 examples/Serial/KNN/KNN.sln diff --git a/examples/CSParallel/KNN/KNN.sln b/examples/CSParallel/KNN/KNN.sln deleted file mode 100644 index 62ec72ad..00000000 --- a/examples/CSParallel/KNN/KNN.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.5.002.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KNN", "KNN.csproj", "{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}" -EndProject -Global - 
GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.ActiveCfg = Release|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {AEB4E020-A8E5-48C3-A343-84A0FEBE91E2} - EndGlobalSection -EndGlobal diff --git a/examples/DotMP/KNN/KNN.sln b/examples/DotMP/KNN/KNN.sln deleted file mode 100644 index 62ec72ad..00000000 --- a/examples/DotMP/KNN/KNN.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.5.002.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KNN", "KNN.csproj", "{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.ActiveCfg = Release|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid 
= {AEB4E020-A8E5-48C3-A343-84A0FEBE91E2} - EndGlobalSection -EndGlobal diff --git a/examples/Serial/KNN/KNN.sln b/examples/Serial/KNN/KNN.sln deleted file mode 100644 index 62ec72ad..00000000 --- a/examples/Serial/KNN/KNN.sln +++ /dev/null @@ -1,25 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.5.002.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KNN", "KNN.csproj", "{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.Build.0 = Debug|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.ActiveCfg = Release|Any CPU - {0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {AEB4E020-A8E5-48C3-A343-84A0FEBE91E2} - EndGlobalSection -EndGlobal From fab2c475b273b98db24eb0fd24aa4fbdf69c55f4 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 10:20:37 -0600 Subject: [PATCH 29/61] exclude obsolete methods from code coverage --- DotMP/Parallel.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs index d24189e3..1c88570a 100644 --- a/DotMP/Parallel.cs +++ b/DotMP/Parallel.cs @@ -972,6 +972,7 @@ public static void ParallelSections(uint? num_threads = null, params Action[] ac /// Thrown when not in a parallel region. [Obsolete("This version of Critical is deprecated. Omit the id parameter for the updated version. 
This overload will be removed in a future release.")] [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static int Critical(int id, Action action) { if (!InParallel()) @@ -1097,6 +1098,7 @@ public static void Master(Action action) /// Thrown when nested inside another worksharing region. [Obsolete("This version of Single is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static void Single(int id, Action action) { var freg = new ForkedRegion(); @@ -1196,6 +1198,7 @@ public static void Single(Action action, [CallerFilePath] string path = "", [Cal /// Thrown when not in a parallel region. [Obsolete("This version of Ordered is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static void Ordered(int id, Action action) { var freg = new ForkedRegion(); From 8b40582bd68e2e4a1b2a825de791739c1bab945e Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 10:29:13 -0600 Subject: [PATCH 30/61] add missing using --- DotMP/Parallel.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs index 1c88570a..6b400d72 100644 --- a/DotMP/Parallel.cs +++ b/DotMP/Parallel.cs @@ -3,6 +3,7 @@ using System.Runtime.CompilerServices; using System.ComponentModel; using System.Threading; +using System.Diagnostics.CodeAnalysis; namespace DotMP { From 594b6bac1712dae783cd3e946f531fca8ed7efde Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 10:29:49 -0600 Subject: [PATCH 31/61] use new critical/ordered/single methods --- DotMP-Tests/CPUTests.cs | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/DotMP-Tests/CPUTests.cs b/DotMP-Tests/CPUTests.cs index defeab2b..9b036e5c 100644 --- 
a/DotMP-Tests/CPUTests.cs +++ b/DotMP-Tests/CPUTests.cs @@ -429,7 +429,7 @@ public void Critical_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { for (int i = 0; i < iters; i++) - DotMP.Parallel.Critical(0, () => ++total); + DotMP.Parallel.Critical(() => ++total); }); total.Should().Be((int)threads * iters); @@ -438,14 +438,13 @@ public void Critical_works() DotMP.Parallel.ParallelRegion(num_threads: 4, action: () => { - if (DotMP.Parallel.GetThreadNum() == 0) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 1) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 2) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 3) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000)); + if (DotMP.Parallel.GetThreadNum() % 2 == 0) DotMP.Parallel.Critical(() => Thread.Sleep(1000)); + if (DotMP.Parallel.GetThreadNum() % 2 == 1) DotMP.Parallel.Critical(() => Thread.Sleep(1000)); }); double elapsed = DotMP.Parallel.GetWTime() - start; - elapsed.Should().BeLessThan(2200); + elapsed.Should().BeLessThan(2.2); + elapsed.Should().BeGreaterThan(2.0); } /// @@ -478,7 +477,7 @@ public void Single_works() { for (int i = 0; i < 10; i++) { - DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total)); + DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total)); } }); @@ -490,7 +489,7 @@ public void Single_works() { for (int i = 0; i < 10; i++) { - DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total)); + DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total)); } }); @@ -646,7 +645,7 @@ public void Ordered_works() DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Static, num_threads: threads, action: i => { - DotMP.Parallel.Ordered(0, () => incrementing[i] = i); + DotMP.Parallel.Ordered(() => incrementing[i] = i); }); for (int i = 0; i < incrementing.Length; i++) @@ -1004,7 +1003,7 @@ public void Tasking_works() 
DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { for (int i = 0; i < threads * 2; i++) { @@ -1033,7 +1032,7 @@ public void Tasking_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { for (int i = 0; i < tasks_to_spawn; i++) { @@ -1093,7 +1092,7 @@ public void Nested_tasks_work() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { DotMP.Parallel.Task(() => { @@ -1263,7 +1262,7 @@ public void Non_parallel_single_should_except() { Assert.Throws(() => { - DotMP.Parallel.Single(0, () => { }); + DotMP.Parallel.Single(() => { }); }); } @@ -1275,7 +1274,7 @@ public void Non_parallel_critical_should_except() { Assert.Throws(() => { - DotMP.Parallel.Critical(0, () => { }); + DotMP.Parallel.Critical(() => { }); }); } @@ -1289,13 +1288,13 @@ public void Nested_worksharing_should_except() { Assert.Throws(() => { - DotMP.Parallel.Single(0, () => { }); + DotMP.Parallel.Single(() => { }); }); }); DotMP.Parallel.ParallelRegion(num_threads: 4, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { Assert.Throws(() => { @@ -1321,7 +1320,7 @@ public void Non_for_ordered_should_except() { Assert.Throws(() => { - DotMP.Parallel.Ordered(0, () => { }); + DotMP.Parallel.Ordered(() => { }); }); } From 5e90c890edf324165cb80a0a29f0adb23eafe992 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 13:08:57 -0600 Subject: [PATCH 32/61] testing forcollapse performance in heat transfer, will fully implement forcollapse later --- DotMP/GPU/AcceleratorHandler.cs | 139 ++++++++++++++++++++++---- DotMP/GPU/Gpu.cs | 113 ++++++++++++++++----- DotMP/GPU/Index.cs | 60 ++++++++++- benchmarks/GPUHeatTransfer/Program.cs | 34 ++----- 4 files changed, 271 insertions(+), 75 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.cs 
b/DotMP/GPU/AcceleratorHandler.cs index c7be93df..3608a6c1 100644 --- a/DotMP/GPU/AcceleratorHandler.cs +++ b/DotMP/GPU/AcceleratorHandler.cs @@ -15,6 +15,7 @@ */ using System; +using System.Collections.Generic; using System.Linq; using ILGPU; using ILGPU.Runtime; @@ -39,9 +40,13 @@ internal class AcceleratorHandler /// internal static Accelerator accelerator; /// - /// + /// Block size to use for kernels. /// private static int block_size; + /// + /// Kernel cache. + /// + private static Dictionary kernels = new Dictionary(); /// /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data. @@ -64,6 +69,7 @@ internal AcceleratorHandler() } accelerator = selectedDevice.CreateAccelerator(context); + //accelerator = context.Devices[0].CreateAccelerator(context); Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); initialized = true; @@ -75,6 +81,82 @@ internal AcceleratorHandler() /// private void Synchronize() => accelerator.Synchronize(); + /// + /// Get the kernel associated with this lambda. + /// + /// The base type of the first argument. Must be an unmanaged type. + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action> GetKernel(Action> action, string src) + where T : unmanaged + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action>)kernels[src]; + } + + /// + /// Get the kernel associated with this lambda. + /// + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. 
+ private Action, GPUArray> GetKernel(Action, GPUArray> action, string src) + where T : unmanaged + where U : unmanaged + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action, GPUArray>)kernels[src]; + } + + /// + /// Get the kernel associated with this lambda. + /// + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action, GPUArray, GPUArray> GetKernel(Action, GPUArray, GPUArray> action, string src) + where T : unmanaged + where U : unmanaged + where V : unmanaged + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action, GPUArray, GPUArray>)kernels[src]; + } + + /// + /// Get the kernel associated with this lambda. + /// + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action, GPUArray, GPUArray, GPUArray> GetKernel(Action, GPUArray, GPUArray, GPUArray> action, string src) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action, GPUArray, GPUArray, GPUArray>)kernels[src]; + } + /// /// Dispatches a kernel with one parameter. /// @@ -82,13 +164,14 @@ internal AcceleratorHandler() /// The end of the loop, exclusive. /// The buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. 
/// The base type of the first argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf, Action> action) + internal void DispatchKernel(int start, int end, Buffer buf, Action> action, string src) where T : unmanaged { var idx = new Index(start); - var kernel = accelerator.LoadStreamKernel(action); + var kernel = GetKernel(action, src); kernel(((end - start) / block_size, block_size), idx, new GPUArray(buf)); @@ -99,22 +182,23 @@ internal void DispatchKernel(int start, int end, Buffer buf, Action /// Dispatches a kernel with two parameters. /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. + /// The starts and ends of the loop. /// The first buffer to run the kernel with. /// The second buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action) + internal void DispatchKernel((int, int)[] ranges, Buffer buf1, Buffer buf2, Action, GPUArray> action, string src) where T : unmanaged where U : unmanaged { - var idx = new Index(start); + int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y); + var idx = new Index(ranges); - var kernel = accelerator.LoadStreamKernel(action); + var kernel = GetKernel(action, src); - kernel(((end - start) / block_size, block_size), idx, + kernel((len / block_size, block_size), idx, new GPUArray(buf1), new GPUArray(buf2)); @@ -130,17 +214,18 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer /// The second buffer to run the kernel with. /// The third buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. 
/// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged { var idx = new Index(start); - var kernel = accelerator.LoadStreamKernel(action); + var kernel = GetKernel(action, src); kernel(((end - start) / block_size, block_size), idx, new GPUArray(buf1), @@ -160,11 +245,12 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer /// The third buffer to run the kernel with. /// The fourth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. /// The base type of the fourth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -172,7 +258,7 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buf { var idx = new Index(start); - var kernel = accelerator.LoadStreamKernel(action); + var kernel = GetKernel(action, src); kernel(((end - start) / block_size, block_size), idx, new GPUArray(buf1), @@ -194,12 +280,13 @@ internal void DispatchKernel(int start, int end, Buffer buf1, Buf /// The fourth buffer to run the kernel with. 
/// The fifth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. /// The base type of the fourth argument. Must be an unmanaged type. /// The base type of the fifth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -232,13 +319,14 @@ internal void DispatchKernel(int start, int end, Buffer buf1, /// The fifth buffer to run the kernel with. /// The sixth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. /// The base type of the fourth argument. Must be an unmanaged type. /// The base type of the fifth argument. Must be an unmanaged type. /// The base type of the sixth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -274,6 +362,7 @@ internal void DispatchKernel(int start, int end, Buffer buf /// The sixth buffer to run the kernel with. /// The seventh buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -281,7 +370,7 @@ internal void DispatchKernel(int start, int end, Buffer buf /// The base type of the fifth argument. Must be an unmanaged type. /// The base type of the sixth argument. Must be an unmanaged type. /// The base type of the seventh argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -320,6 +409,7 @@ internal void DispatchKernel(int start, int end, Buffer /// The seventh buffer to run the kernel with. /// The eighth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. 
Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -328,7 +418,7 @@ internal void DispatchKernel(int start, int end, Buffer /// The base type of the sixth argument. Must be an unmanaged type. /// The base type of the seventh argument. Must be an unmanaged type. /// The base type of the eighth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -370,6 +460,7 @@ internal void DispatchKernel(int start, int end, Buffer< /// The eighth buffer to run the kernel with. /// The ninth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -379,7 +470,7 @@ internal void DispatchKernel(int start, int end, Buffer< /// The base type of the seventh argument. Must be an unmanaged type. /// The base type of the eighth argument. Must be an unmanaged type. /// The base type of the ninth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -424,6 +515,7 @@ internal void DispatchKernel(int start, int end, Buff /// The ninth buffer to run the kernel with. /// The tenth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -434,7 +526,7 @@ internal void DispatchKernel(int start, int end, Buff /// The base type of the eighth argument. Must be an unmanaged type. /// The base type of the ninth argument. Must be an unmanaged type. /// The base type of the tenth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -482,6 +574,7 @@ internal void DispatchKernel(int start, int end, B /// The tenth buffer to run the kernel with. /// The eleventh buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -493,7 +586,7 @@ internal void DispatchKernel(int start, int end, B /// The base type of the ninth argument. Must be an unmanaged type. /// The base type of the tenth argument. Must be an unmanaged type. /// The base type of the eleventh argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -544,6 +637,7 @@ internal void DispatchKernel(int start, int end /// The eleventh buffer to run the kernel with. /// The twelfth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -556,7 +650,7 @@ internal void DispatchKernel(int start, int end /// The base type of the tenth argument. Must be an unmanaged type. /// The base type of the eleventh argument. Must be an unmanaged type. /// The base type of the twelfth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged @@ -610,6 +704,7 @@ internal void DispatchKernel(int start, int /// The twelfth buffer to run the kernel with. /// The thirteenth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The originating caller location. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -623,7 +718,7 @@ internal void DispatchKernel(int start, int /// The base type of the eleventh argument. Must be an unmanaged type. /// The base type of the twelfth argument. Must be an unmanaged type. /// The base type of the thirteenth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) where T : unmanaged where U : unmanaged where V : unmanaged diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs index c9b8963c..1e2cd7a7 100644 --- a/DotMP/GPU/Gpu.cs +++ b/DotMP/GPU/Gpu.cs @@ -15,7 +15,7 @@ */ using System; -using ILGPU; +using System.Runtime.CompilerServices; namespace DotMP.GPU { @@ -26,6 +26,17 @@ namespace DotMP.GPU /// public static class Parallel { + /// + /// Formats the caller information for determining uniqueness of a call. + /// + /// The calling file. + /// The calling line number. + /// A formatted string representing "{filename}:{linenum}" + private static string FormatCaller(string filename, int linenum) + { + return string.Format("{0}:{1}", filename, linenum); + } + /// /// Creates a GPU parallel for loop. /// The body of the kernel is run on a GPU target. @@ -35,12 +46,15 @@ public static class Parallel /// The end of the loop, exclusive. /// The buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf, Action> action) + public static void ParallelFor(int start, int end, Buffer buf, Action> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf, action, src); } /// @@ -53,14 +67,26 @@ public static void ParallelFor(int start, int end, Buffer buf, ActionThe first buffer to run the kernel with. /// The second buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action) + /*public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + where T : unmanaged + where U : unmanaged + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, action, src); + }*/ + + public static void ParallelForCollapse((int, int) range1, (int, int) range2, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(new (int, int)[] { range1, range2 }, buf1, buf2, action, src); } /// @@ -74,16 +100,19 @@ public static void ParallelFor(int start, int end, Buffer buf1, Buffer< /// The second buffer to run the kernel with. 
/// The third buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, action, src); } /// @@ -98,18 +127,21 @@ public static void ParallelFor(int start, int end, Buffer buf1, Buff /// The third buffer to run the kernel with. /// The fourth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. /// The base type of the fourth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged where W : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action, src); } /// @@ -125,12 +157,14 @@ public static void ParallelFor(int start, int end, Buffer buf1, B /// The fourth buffer to run the kernel with. /// The fifth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. /// The base type of the fourth argument. Must be an unmanaged type. /// The base type of the fifth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -138,7 +172,8 @@ public static void ParallelFor(int start, int end, Buffer buf1 where X : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action, src); } /// @@ -155,13 +190,15 @@ public static void ParallelFor(int start, int end, Buffer buf1 /// The fifth buffer to run the kernel with. /// The sixth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. /// The base type of the fourth argument. Must be an unmanaged type. /// The base type of the fifth argument. Must be an unmanaged type. /// The base type of the sixth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -170,7 +207,8 @@ public static void ParallelFor(int start, int end, Buffer b where Y : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action, src); } /// @@ -188,6 +226,8 @@ public static void ParallelFor(int start, int end, Buffer b /// The sixth buffer to run the kernel with. /// The seventh buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -195,7 +235,7 @@ public static void ParallelFor(int start, int end, Buffer b /// The base type of the fifth argument. Must be an unmanaged type. /// The base type of the sixth argument. Must be an unmanaged type. /// The base type of the seventh argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -205,7 +245,8 @@ public static void ParallelFor(int start, int end, Buffer @@ -224,6 +265,8 @@ public static void ParallelFor(int start, int end, BufferThe seventh buffer to run the kernel with. /// The eighth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -232,7 +275,7 @@ public static void ParallelFor(int start, int end, BufferThe base type of the sixth argument. Must be an unmanaged type. /// The base type of the seventh argument. Must be an unmanaged type. /// The base type of the eighth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -243,7 +286,8 @@ public static void ParallelFor(int start, int end, Buffe where A : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action, src); } /// @@ -263,6 +307,8 @@ public static void ParallelFor(int start, int end, Buffe /// The eighth buffer to run the kernel with. /// The ninth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -272,7 +318,7 @@ public static void ParallelFor(int start, int end, Buffe /// The base type of the seventh argument. Must be an unmanaged type. /// The base type of the eighth argument. Must be an unmanaged type. /// The base type of the ninth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -284,7 +330,8 @@ public static void ParallelFor(int start, int end, Bu where B : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action, src); } /// @@ -305,6 +352,8 @@ public static void ParallelFor(int start, int end, Bu /// The ninth buffer to run the kernel with. /// The tenth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -315,7 +364,7 @@ public static void ParallelFor(int start, int end, Bu /// The base type of the eighth argument. Must be an unmanaged type. /// The base type of the ninth argument. Must be an unmanaged type. /// The base type of the tenth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -328,7 +377,8 @@ public static void ParallelFor(int start, int end, where C : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action, src); } /// @@ -350,6 +400,8 @@ public static void ParallelFor(int start, int end, /// The tenth buffer to run the kernel with. /// The eleventh buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -361,7 +413,7 @@ public static void ParallelFor(int start, int end, /// The base type of the ninth argument. Must be an unmanaged type. /// The base type of the tenth argument. Must be an unmanaged type. /// The base type of the eleventh argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -375,7 +427,8 @@ public static void ParallelFor(int start, int e where D : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action, src); } /// @@ -398,6 +451,8 @@ public static void ParallelFor(int start, int e /// The eleventh buffer to run the kernel with. /// The twelfth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -410,7 +465,7 @@ public static void ParallelFor(int start, int e /// The base type of the tenth argument. Must be an unmanaged type. /// The base type of the eleventh argument. Must be an unmanaged type. /// The base type of the twelfth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -425,7 +480,8 @@ public static void ParallelFor(int start, in where E : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action, src); } /// @@ -449,6 +505,8 @@ public static void ParallelFor(int start, in /// The twelfth buffer to run the kernel with. /// The thirteenth buffer to run the kernel with. /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. /// The base type of the third argument. Must be an unmanaged type. @@ -462,7 +520,7 @@ public static void ParallelFor(int start, in /// The base type of the eleventh argument. Must be an unmanaged type. /// The base type of the twelfth argument. Must be an unmanaged type. /// The base type of the thirteenth argument. 
Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged where V : unmanaged @@ -478,7 +536,8 @@ public static void ParallelFor(int start, where F : unmanaged { var handler = new AcceleratorHandler(); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action, src); } } } \ No newline at end of file diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index b1e9df75..8a0c09f4 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -17,6 +17,8 @@ using ILGPU; using System; using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Runtime.CompilerServices; namespace DotMP.GPU { @@ -29,7 +31,13 @@ public struct Index /// /// The start of the for loop, for index calculations. 
/// - private int start; + private int start1; + private int start2; + + private int i_prv; + private int j_prv; + + private int diff; /// private void Synchronize() => accelerator.Synchronize(); +<# for (int c = 1; c <= max; c++) { #> /// /// Get the kernel associated with this lambda. /// /// The action provided on the CPU. /// The calling location. /// The GPU kernel. -<# for (int c = 1; c <= max; c++) { #> private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > GetKernel< @@ -112,602 +112,39 @@ namespace DotMP.GPU } <# } #> +<# for (int c = 1; c <= max; c++) { #> /// - /// Dispatches a kernel with one parameter. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf, Action> action, string src) - where T : unmanaged - { - var idx = new Index(start); - - var kernel = GetKernel(action, src); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with two parameters. - /// - /// The starts and ends of the loop. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. 
- internal void DispatchKernel((int, int)[] ranges, Buffer buf1, Buffer buf2, Action, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - { - int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y); - var idx = new Index(ranges); - - var kernel = GetKernel(action, src); - - kernel((len / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with three parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - { - var idx = new Index(start); - - var kernel = GetKernel(action, src); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with four parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. 
- /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - { - var idx = new Index(start); - - var kernel = GetKernel(action, src); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with five parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with six parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with seven parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with eight parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with nine parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. 
- /// The base type of the ninth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with ten parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. 
Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with eleven parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. 
- /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10), - new GPUArray(buf11)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with twelve parameters. + /// Dispatches a kernel with the given number of parameters. /// /// The start of the loop, inclusive. /// The end of the loop, exclusive. 
- /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> /// The kernel to run on the GPU. /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(int start, int end, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> { var idx = new Index(start); - var kernel = accelerator.LoadStreamKernel(action); + var kernel = GetKernel(action, src); - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10), - new GPUArray(buf11), - new GPUArray(buf12)); + kernel(((end - start) / block_size, block_size), idx +<# for (int i = 0; i < c; i++) { #> + , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) +<# } #> + ); Synchronize(); - } - - /// - /// Dispatches a kernel with thirteen parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. 
- /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The thirteenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. - /// The base type of the thirteenth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - where F : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10), - new GPUArray(buf11), - new GPUArray(buf12), - new GPUArray(buf13)); +} +<# } #> - Synchronize(); - } } } \ No newline at end of file From 267a789f309d7ad7577472cc61aaf78de50214d8 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 17:37:42 -0600 Subject: [PATCH 38/61] revert collapse --- DotMP/GPU/Gpu.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs index 1e2cd7a7..5a68462f 100644 --- a/DotMP/GPU/Gpu.cs +++ b/DotMP/GPU/Gpu.cs @@ -71,23 +71,23 @@ public static void ParallelFor(int start, int end, Buffer buf, ActionThe path to the file this method was called from. /// The base type of the first argument. Must be an unmanaged type. /// The base type of the second argument. Must be an unmanaged type. 
- /*public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged { var handler = new AcceleratorHandler(); string src = FormatCaller(path, line); handler.DispatchKernel(start, end, buf1, buf2, action, src); - }*/ + } - public static void ParallelForCollapse((int, int) range1, (int, int) range2, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + /*public static void ParallelForCollapse((int, int) range1, (int, int) range2, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) where T : unmanaged where U : unmanaged { var handler = new AcceleratorHandler(); string src = FormatCaller(path, line); handler.DispatchKernel(new (int, int)[] { range1, range2 }, buf1, buf2, action, src); - } + }*/ /// /// Creates a GPU parallel for loop. From 3d610b43c561cad80884a4fb9a7bedacd39cca4f Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 17:48:17 -0600 Subject: [PATCH 39/61] remove excess newlines --- DotMP/GPU/AcceleratorHandler.tt | 2 -- 1 file changed, 2 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 96cb8336..3f2af598 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -14,7 +14,6 @@ * write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ - <#@ template debug="false" hostspecific="false" language="C#" #> <#@ output extension=".cs" #> <# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' }; @@ -145,6 +144,5 @@ namespace DotMP.GPU Synchronize(); } <# } #> - } } \ No newline at end of file From d03d31bbf9a6757172fc4a41707d77650c20418f Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 18:04:03 -0600 Subject: [PATCH 40/61] get parallelfor t4 gen working --- .gitignore | 1 + DotMP/GPU/AcceleratorHandler.tt | 46 +-- DotMP/GPU/Gpu.cs | 543 -------------------------------- DotMP/GPU/Gpu.tt | 76 +++++ 4 files changed, 100 insertions(+), 566 deletions(-) delete mode 100644 DotMP/GPU/Gpu.cs create mode 100644 DotMP/GPU/Gpu.tt diff --git a/.gitignore b/.gitignore index df42484a..ac1e66dc 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ docs/* *.opencover.xml *.sln AcceleratorHandler.cs +Gpu.cs ProcessedREADME.md # User-specific files diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 3f2af598..20fd8fe3 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -95,11 +95,11 @@ namespace DotMP.GPU /// The GPU kernel. private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> -> GetKernel< + > GetKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> ->(Action(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> -> action, string src) + > action, string src) <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> { if (!kernels.ContainsKey(src)) @@ -108,41 +108,41 @@ namespace DotMP.GPU return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> >) kernels[src]; - } + } <# } #> <# for (int c = 1; c <= max; c++) { #> - /// - /// Dispatches a kernel with the given number of parameters. - /// - /// The start of the loop, inclusive. 
- /// The end of the loop, exclusive. + /// + /// Dispatches a kernel with the given number of parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. <# for (int i = 0; i < c; i++) { #> - /// Buffer #<#= i + 1 #> to run the kernel with. + /// Buffer #<#= i + 1 #> to run the kernel with. <# } #> - /// The kernel to run on the GPU. - /// The originating caller location. - internal void DispatchKernel< + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> - >(int start, int end, + >(int start, int end, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> - > action, string src) + > action, string src) <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> - { - var idx = new Index(start); + { + var idx = new Index(start); - var kernel = GetKernel(action, src); + var kernel = GetKernel(action, src); - kernel(((end - start) / block_size, block_size), idx + kernel(((end - start) / block_size, block_size), idx <# for (int i = 0; i < c; i++) { #> , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) <# } #> ); - Synchronize(); -} + Synchronize(); + } <# } #> -} + } } \ No newline at end of file diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs deleted file mode 100644 index 5a68462f..00000000 --- a/DotMP/GPU/Gpu.cs +++ /dev/null @@ -1,543 +0,0 @@ -/* -* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. -* Copyright (C) 2023 Phillip Allen Lane -* -* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser -* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or -* (at your option) any later version. 
-* -* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the -* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -* License for more details. -* -* You should have received a copy of the GNU Lesser General Public License along with this library; if not, -* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -*/ - -using System; -using System.Runtime.CompilerServices; - -namespace DotMP.GPU -{ - /// - /// The main class of DotMP's GPU API, powered by the ILGPU project. - /// Contains all the main methods for constructing and running GPU kernels. - /// The GPU API is not thread-safe at the current moment, so its methods should not be called from within a Parallel.ParallelRegion! - /// - public static class Parallel - { - /// - /// Formats the caller information for determining uniqueness of a call. - /// - /// The calling file. - /// The calling line number. - /// A formatted string representing "{filename}:{linenum}" - private static string FormatCaller(string filename, int linenum) - { - return string.Format("{0}:{1}", filename, linenum); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that one array is used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf, Action> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that two arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, action, src); - } - - /*public static void ParallelForCollapse((int, int) range1, (int, int) range2, Buffer buf1, Buffer buf2, Action, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(new (int, int)[] { range1, range2 }, buf1, buf2, action, src); - }*/ - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that three arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. 
- /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that four arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that five arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that six arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that seven arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that eight arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that nine arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. 
- /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that ten arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. 
Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that eleven arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. 
- /// The eleventh buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that twelve arrays are used on the GPU. 
- /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. 
- public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action, src); - } - - /// - /// Creates a GPU parallel for loop. - /// The body of the kernel is run on a GPU target. - /// This overload specifies that thirteen arrays are used on the GPU. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The thirteenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The line number this method was called from. - /// The path to the file this method was called from. - /// The base type of the first argument. Must be an unmanaged type. 
- /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. - /// The base type of the thirteenth argument. Must be an unmanaged type. - public static void ParallelFor(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - where F : unmanaged - { - var handler = new AcceleratorHandler(); - string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action, src); - } - } -} \ No newline at end of file diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt new file mode 100644 index 00000000..872b0750 --- /dev/null +++ b/DotMP/GPU/Gpu.tt @@ -0,0 +1,76 @@ +/* +* DotMP - A 
collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +<#@ template debug="false" hostspecific="false" language="C#" #> +<#@ output extension=".cs" #> +<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' }; + int max = 13; #> + +using System; +using System.Runtime.CompilerServices; + +namespace DotMP.GPU +{ + /// + /// The main class of DotMP's GPU API, powered by the ILGPU project. + /// Contains all the main methods for constructing and running GPU kernels. + /// The GPU API is not thread-safe at the current moment, so its methods should not be called from within a Parallel.ParallelRegion! + /// + public static class Parallel + { + /// + /// Formats the caller information for determining uniqueness of a call. + /// + /// The calling file. + /// The calling line number. + /// A formatted string representing "{filename}:{linenum}" + private static string FormatCaller(string filename, int linenum) + { + return string.Format("{0}:{1}", filename, linenum); + } + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. 
+ /// This overload specifies that one array is used on the GPU. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelFor< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(int start, int end, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(start, end, +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + } +} \ No newline at end of file From 945f7e74637afdcb86be95aff2c9556643602374 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 18:15:05 -0600 Subject: [PATCH 41/61] implement collapsed for loops --- DotMP/GPU/AcceleratorHandler.tt | 10 +++++----- DotMP/GPU/Gpu.tt | 33 ++++++++++++++++++++++++++++++++- DotMP/GPU/Index.cs | 32 +++++++++++++++++--------------- 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 20fd8fe3..88434822 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -115,8 +115,7 @@ namespace DotMP.GPU /// /// Dispatches a kernel with the given number of parameters. /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. + /// The ranges of the for loop. <# for (int i = 0; i < c; i++) { #> /// Buffer #<#= i + 1 #> to run the kernel with. 
<# } #> @@ -124,18 +123,19 @@ namespace DotMP.GPU /// The originating caller location. internal void DispatchKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> - >(int start, int end, + >((int, int)[] ranges, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, string src) <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> { - var idx = new Index(start); + var len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y); + var idx = new Index(ranges); var kernel = GetKernel(action, src); - kernel(((end - start) / block_size, block_size), idx + kernel((len / block_size, block_size), idx <# for (int i = 0; i < c; i++) { #> , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) <# } #> diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt index 872b0750..d6962b02 100644 --- a/DotMP/GPU/Gpu.tt +++ b/DotMP/GPU/Gpu.tt @@ -67,7 +67,38 @@ namespace DotMP.GPU { var handler = new AcceleratorHandler(); string src = FormatCaller(path, line); - handler.DispatchKernel(start, end, + handler.DispatchKernel(new (int, int)[] { (start, end) }, +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Creates a collapsed GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// This overload specifies that one array is used on the GPU. + /// + /// The range of the outer for loop. + /// The range of the outer for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelForCollapse< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? 
"," : "") #> <# } #> + >((int, int) range1, (int, int) range2, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(new (int, int)[] { range1, range2 }, <# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> action, src); } diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index 8a0c09f4..83d9f7e6 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -42,23 +42,25 @@ public struct Index /// /// Constructor. /// - /// The start of the parallel for loop. - internal Index(int start) - { - this.start1 = start; - this.start2 = 0; - i_prv = -1; - j_prv = -1; - diff = 0; - } - + /// The ranges of the for loop. internal Index((int, int)[] ranges) { - start1 = ranges[0].Item1; - start2 = ranges[1].Item1; - i_prv = -1; - j_prv = -1; - diff = ranges[1].Item2 - ranges[1].Item1; + if (ranges.Length == 1) + { + start1 = ranges[0].Item1; + start2 = -1; + i_prv = -1; + j_prv = -1; + diff = -1; + } + else + { + start1 = ranges[0].Item1; + start2 = ranges[1].Item1; + i_prv = -1; + j_prv = -1; + diff = ranges[1].Item2 - ranges[1].Item1; + } } /// From d90279e3923fbc7fddfc7681b93108cd4ca5d8da Mon Sep 17 00:00:00 2001 From: Lane Date: Sun, 12 Nov 2023 13:03:10 -0600 Subject: [PATCH 42/61] remove erroneous comment line --- DotMP/GPU/Gpu.tt | 2 -- 1 file changed, 2 deletions(-) diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt index d6962b02..4ac1f49e 100644 --- a/DotMP/GPU/Gpu.tt +++ b/DotMP/GPU/Gpu.tt @@ -46,7 +46,6 @@ namespace DotMP.GPU /// /// Creates a GPU parallel for loop. /// The body of the kernel is run on a GPU target. - /// This overload specifies that one array is used on the GPU. 
/// /// The start of the loop, inclusive. /// The end of the loop, exclusive. @@ -77,7 +76,6 @@ namespace DotMP.GPU /// /// Creates a collapsed GPU parallel for loop. /// The body of the kernel is run on a GPU target. - /// This overload specifies that one array is used on the GPU. /// /// The range of the outer for loop. /// The range of the outer for loop. From f22ac639a6300e4db8a8790030992ef7efa9ae46 Mon Sep 17 00:00:00 2001 From: Lane Date: Sun, 12 Nov 2023 13:03:26 -0600 Subject: [PATCH 43/61] test with 500x500 instead of 514x514 --- benchmarks/GPUHeatTransfer/Program.cs | 38 ++++++++++++++++++++------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs index 58ca52e0..1800e673 100644 --- a/benchmarks/GPUHeatTransfer/Program.cs +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -37,7 +37,7 @@ public class HeatTransfer public enum ParType { DMPFor, DMPGPU } // test dims of 100x100, 1000x1000, and 5000x5000 - [Params(514)] + [Params(500)] public int dim; // test with 10 steps and 100 steps @@ -64,8 +64,13 @@ public void Setup() scratch = new double[dim, dim]; grid = new double[dim, dim]; - grid[0, dim / 2 - 1] = 100.0; - grid[0, dim / 2] = 100.0; + for (int i = 0; i < dim; i++) + { + grid[0, i] = 100.0; + grid[i, 0] = 100.0; + grid[dim - 1, i] = 100.0; + grid[i, dim - 1] = 100.0; + } if (type == ParType.DMPGPU) { @@ -129,13 +134,17 @@ public void DoStep() case ParType.DMPGPU: DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => { + int i = idx.i; + int j = idx.j; //set the scratch array to the average of the surrounding cells - scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]); + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); }); DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), 
gridbuf, scratchbuf, (idx, grid, scratch) => { - grid[idx.i, idx.j] = scratch[idx.i, idx.j]; + int i = idx.i; + int j = idx.j; + grid[i, j] = scratch[i, j]; }); break; } @@ -154,7 +163,7 @@ public class HeatTransferVerify public enum ParType { DMPFor, DMPGPU } // test dims of 100x100, 1000x1000, and 5000x5000 - public int dim = 514; + public int dim = 500; // test with 10 steps and 100 steps public int steps = 100; @@ -177,8 +186,13 @@ public void Setup() scratch = new double[dim, dim]; grid = new double[dim, dim]; - grid[0, dim / 2 - 1] = 100.0; - grid[0, dim / 2] = 100.0; + for (int i = 0; i < dim; i++) + { + grid[0, i] = 100.0; + grid[i, 0] = 100.0; + grid[dim - 1, i] = 100.0; + grid[i, dim - 1] = 100.0; + } if (type == ParType.DMPGPU) { @@ -241,13 +255,17 @@ public void DoStep() case ParType.DMPGPU: DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => { + int i = idx.i; + int j = idx.j; //set the scratch array to the average of the surrounding cells - scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]); + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); }); DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => { - grid[idx.i, idx.j] = scratch[idx.i, idx.j]; + int i = idx.i; + int j = idx.j; + grid[i, j] = scratch[i, j]; }); break; } From aeba3b837f36790a080ea21c49d93edf51091b76 Mon Sep 17 00:00:00 2001 From: Lane Date: Sun, 12 Nov 2023 13:03:52 -0600 Subject: [PATCH 44/61] properly handle loops not divisible by block size --- DotMP/GPU/AcceleratorHandler.tt | 13 +++++++++++++ DotMP/GPU/Index.cs | 19 +++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 88434822..260354f7 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ 
b/DotMP/GPU/AcceleratorHandler.tt @@ -141,6 +141,19 @@ namespace DotMP.GPU <# } #> ); + int not_done = len % block_size; + + if (not_done > 0) + { + idx = new Index(ranges, len - (not_done)); + + kernel((1, not_done), idx +<# for (int i = 0; i < c; i++) { #> + , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) +<# } #> + ); + } + Synchronize(); } <# } #> diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index 83d9f7e6..be52d892 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -15,6 +15,7 @@ */ using ILGPU; +using ILGPU.Runtime.Cuda; using System; using System.Diagnostics.CodeAnalysis; using System.Linq; @@ -39,12 +40,16 @@ public struct Index private int diff; + private int offset; + /// /// Constructor. /// /// The ranges of the for loop. - internal Index((int, int)[] ranges) + internal Index((int, int)[] ranges, int offset = 0) { + this.offset = offset; + if (ranges.Length == 1) { start1 = ranges[0].Item1; @@ -70,7 +75,7 @@ internal Index((int, int)[] ranges) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static implicit operator int(Index h) { - return Grid.GlobalLinearIndex + h.start1; + return Grid.GlobalLinearIndex + h.start1 + h.offset; } public int i @@ -79,8 +84,9 @@ public int i { if (i_prv == -1) { - i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff); - j_prv = Grid.GlobalLinearIndex - i_prv * diff; + int idxoffset = Grid.GlobalLinearIndex + offset; + i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff); + j_prv = idxoffset - i_prv * diff; i_prv += start1; j_prv += start2; } @@ -95,8 +101,9 @@ public int j { if (j_prv == -1) { - i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff); - j_prv = Grid.GlobalLinearIndex - i_prv * diff; + int idxoffset = Grid.GlobalLinearIndex + offset; + i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff); + j_prv = idxoffset - i_prv * diff; i_prv += start1; j_prv += start2; } From a93d9fafc93e48f45c275e6b53c3e39ef69b0a13 Mon Sep 17 00:00:00 2001 From: Lane Date: Sun, 12 Nov 
2023 13:11:44 -0600 Subject: [PATCH 45/61] turn array bounds into off-256-divisible size for better testing --- DotMP-Tests/GPUTests.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs index c155fd2d..244f063c 100644 --- a/DotMP-Tests/GPUTests.cs +++ b/DotMP-Tests/GPUTests.cs @@ -23,11 +23,11 @@ public class GPUTests [Fact] public void GPU_for_works() { - double[] a = new double[65536]; - double[] x = new double[65536]; - double[] y = new double[65536]; - float[] res = new float[65536]; - float[] res_cpu = new float[65536]; + double[] a = new double[50000]; + double[] x = new double[50000]; + double[] y = new double[50000]; + float[] res = new float[50000]; + float[] res_cpu = new float[50000]; random_init(a); random_init(x); @@ -81,4 +81,4 @@ private void random_init(T[] arr) } } } -} \ No newline at end of file +} From 11d4baf3a41b97aa568b92cd9bfcbaa009f74d30 Mon Sep 17 00:00:00 2001 From: Lane Date: Sun, 12 Nov 2023 18:20:47 -0600 Subject: [PATCH 46/61] enable more optimizations --- DotMP/GPU/AcceleratorHandler.tt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 260354f7..31b71070 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -60,7 +60,12 @@ namespace DotMP.GPU { if (initialized) return; - context = Context.CreateDefault(); + context = Context.Create() + .Optimize(OptimizationLevel.O2) + .Inlining(InliningMode.Aggressive) + .AllAccelerators() + //.Math(MathMode.Fast32BitOnly) + .ToContext(); var selectedDevice = context.Devices[0]; foreach (var d in context.Devices) @@ -158,4 +163,4 @@ namespace DotMP.GPU } <# } #> } -} \ No newline at end of file +} From 1c12e3911144a612afc495bf10e7402ba90d8698 Mon Sep 17 00:00:00 2001 From: Lane Date: Sun, 12 Nov 2023 18:20:56 -0600 Subject: [PATCH 47/61] add GPU kernel launch overhead benchmark ---
benchmarks/GPUOverhead/GPUOverhead.csproj | 18 ++++++++ benchmarks/GPUOverhead/Program.cs | 56 +++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 benchmarks/GPUOverhead/GPUOverhead.csproj create mode 100644 benchmarks/GPUOverhead/Program.cs diff --git a/benchmarks/GPUOverhead/GPUOverhead.csproj b/benchmarks/GPUOverhead/GPUOverhead.csproj new file mode 100644 index 00000000..9cf0a6f0 --- /dev/null +++ b/benchmarks/GPUOverhead/GPUOverhead.csproj @@ -0,0 +1,18 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + + + + + diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs new file mode 100644 index 00000000..b9ff18a5 --- /dev/null +++ b/benchmarks/GPUOverhead/Program.cs @@ -0,0 +1,56 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Running; +using BenchmarkDotNet.Diagnosers; + +/* jscpd:ignore-start */ + +[SimpleJob(RuntimeMoniker.Net60)] +[ThreadingDiagnoser] +[HardwareCounters] +[EventPipeProfiler(EventPipeProfile.CpuSampling)] +public class Overhead +{ + DotMP.GPU.Buffer buf; + + // run the setup + [GlobalSetup] + public void Setup() + { + buf = new DotMP.GPU.Buffer(new byte[1], DotMP.GPU.Buffer.Behavior.NoCopy); + } + + //run the simulation + [Benchmark] + public void TestOverhead() + { + DotMP.GPU.Parallel.ParallelFor(0, 1, buf, (i, buf) => { }); + } +} + +/* jscpd:ignore-end */ + +// driver +public class Program +{ + public static void Main(string[] args) + { + BenchmarkRunner.Run(); + } +} From f2b2360f771411c0c6fadc902510295b8441dfe1 Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 07:39:35 -0600 Subject: [PATCH 48/61] begin progress towards better index integration --- DotMP/GPU/AcceleratorHandler.tt | 191 +++++++++++++- DotMP/GPU/Gpu.tt | 49 +++- DotMP/GPU/Index.cs | 345 ++++++++++++++++++++++---- benchmarks/GPUHeatTransfer/Program.cs | 16 +- 4 files changed, 516 insertions(+), 85 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 31b71070..0fbe6ee5 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -98,19 +98,75 @@ namespace DotMP.GPU /// The action provided on the CPU. /// The calling location. /// The GPU kernel. - private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > GetKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> - >(Action(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
", " : "" #> <# } #> > action, string src) -<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> { if (!kernels.ContainsKey(src)) kernels.Add(src, accelerator.LoadStreamKernel(action)); - return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
"," : "" #> <# } #> >) kernels[src]; } @@ -118,9 +174,9 @@ namespace DotMP.GPU <# for (int c = 1; c <= max; c++) { #> /// - /// Dispatches a kernel with the given number of parameters. + /// Dispatches a linear kernel with the given number of parameters. /// - /// The ranges of the for loop. + /// The range of the for loop. <# for (int i = 0; i < c; i++) { #> /// Buffer #<#= i + 1 #> to run the kernel with. <# } #> @@ -128,15 +184,17 @@ namespace DotMP.GPU /// The originating caller location. internal void DispatchKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> - >((int, int)[] ranges, + >((int, int) range1, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, string src) -<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> { - var len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y); - var idx = new Index(ranges); + var len = range1.Item2 - range1.Item1; + var idx = new IndexI(range1); var kernel = GetKernel(action, src); @@ -150,7 +208,7 @@ namespace DotMP.GPU if (not_done > 0) { - idx = new Index(ranges, len - (not_done)); + idx = new IndexI(range1, len - (not_done)); kernel((1, not_done), idx <# for (int i = 0; i < c; i++) { #> @@ -162,5 +220,114 @@ namespace DotMP.GPU Synchronize(); } <# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Dispatches a 2D kernel with the given number of parameters. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? 
"," : "") #> <# } #> + >((int, int) range1, (int, int) range2, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1); + var i = new IndexI(range1, range2); + var j = new IndexJ(range1, range2); + + var kernel = GetKernel(action, src); + + kernel((len / block_size, block_size), i, j +<# for (int i = 0; i < c; i++) { #> + , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + i = new IndexI(range1, range2, offset); + j = new IndexJ(range1, range2, offset); + + kernel((1, not_done), i, j +<# for (int i = 0; i < c; i++) { #> + , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) +<# } #> + ); + } + + Synchronize(); + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Dispatches a 3D kernel with the given number of parameters. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, (int, int) range3, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
"," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1); + var i = new IndexI(range1, range2, range3); + var j = new IndexJ(range1, range2, range3); + var k = new IndexK(range1, range2, range3); + + var kernel = GetKernel(action, src); + + kernel((len / block_size, block_size), i, j, k +<# for (int i = 0; i < c; i++) { #> + , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + i = new IndexI(range1, range2, range3, offset); + j = new IndexJ(range1, range2, range3, offset); + k = new IndexK(range1, range2, range3, offset); + + kernel((1, not_done), i, j, k +<# for (int i = 0; i < c; i++) { #> + , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) +<# } #> + ); + } + + Synchronize(); + } +<# } #> } } diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt index 4ac1f49e..060c3d8a 100644 --- a/DotMP/GPU/Gpu.tt +++ b/DotMP/GPU/Gpu.tt @@ -59,26 +59,26 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> >(int start, int end, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> { var handler = new AcceleratorHandler(); string src = FormatCaller(path, line); - handler.DispatchKernel(new (int, int)[] { (start, end) }, + handler.DispatchKernel((start, end), <# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> action, src); } <# } #> -<# for (int c = 1; c <= max; c++) { #> +<# for (int c = 1; c <= max - 1; c++) { #> /// /// Creates a collapsed GPU parallel for loop. 
/// The body of the kernel is run on a GPU target. /// /// The range of the outer for loop. - /// The range of the outer for loop. + /// The range of the inner for loop. <# for (int i = 0; i < c; i++) { #> /// Buffer #<#= i + 1 #> to run the kernel with. <# } #> @@ -89,14 +89,49 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> >((int, int) range1, (int, int) range2, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) -<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(range1, range2, +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Creates a collapsed GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// + /// The range of the outer for loop. + /// The range of the middle for loop. + /// The range of the inner for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelForCollapse< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, (int, int) range3, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
"," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> { var handler = new AcceleratorHandler(); string src = FormatCaller(path, line); - handler.DispatchKernel(new (int, int)[] { range1, range2 }, + handler.DispatchKernel(range1, range2, range3, <# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> action, src); } diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index be52d892..f80cf57d 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -15,101 +15,338 @@ */ using ILGPU; -using ILGPU.Runtime.Cuda; -using System; using System.Diagnostics.CodeAnalysis; -using System.Linq; using System.Runtime.CompilerServices; namespace DotMP.GPU { /// - /// Handle for a GPU kernel to retrieve its kernel variables. + /// Represents an index passed as the first index argument. /// [ExcludeFromCodeCoverage] - public struct Index + public struct IndexI { /// - /// The start of the for loop, for index calculations. + /// The start of the first for loop, for index calculations. /// private int start1; + + /// + /// The start of the second for loop, for index calculations. + /// private int start2; - private int i_prv; - private int j_prv; + /// + /// The start of the third for loop, for index calculations. + /// + private int start3; + + /// + /// The index to return. + /// + private int idx_prv; + + /// + /// The difference between the second set of ranges. + /// + private int diff2; - private int diff; + /// + /// The difference between the third set of ranges. + /// + private int diff3; + /// + /// The offset, in case of a followup kernel. + /// private int offset; + /// + /// The number of dimensions. + /// + private int dims; + /// /// Constructor. /// - /// The ranges of the for loop. - internal Index((int, int)[] ranges, int offset = 0) + /// The range of the for loop. + /// The offset for followup kernels. 
+ internal IndexI((int, int) range, int offset = 0) { this.offset = offset; - if (ranges.Length == 1) - { - start1 = ranges[0].Item1; - start2 = -1; - i_prv = -1; - j_prv = -1; - diff = -1; - } - else + start1 = range.Item1; + start2 = -1; + start3 = -1; + idx_prv = -1; + diff2 = -1; + diff3 = -1; + dims = 1; + } + + /// + /// Constructor. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. + /// The offset for followup kernels. + internal IndexI((int, int) range1, (int, int) range2, int offset = 0) + { + this.offset = offset; + + start1 = range1.Item1; + start2 = range2.Item1; + start3 = -1; + idx_prv = -1; + diff2 = range2.Item2 - range2.Item1; + diff3 = -1; + dims = 2; + } + + /// + /// Constructor. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. + /// The offset for followup kernels. + internal IndexI((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0) + { + this.offset = offset; + + start1 = range1.Item1; + start2 = range2.Item1; + start3 = range3.Item1; + idx_prv = -1; + diff2 = range2.Item2 - range2.Item1; + diff3 = range3.Item2 - range3.Item1; + dims = 3; + } + + /// + /// Casts an index to an int. + /// + /// The Index struct to cast. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator int(IndexI h) + { + switch (h.dims) { - start1 = ranges[0].Item1; - start2 = ranges[1].Item1; - i_prv = -1; - j_prv = -1; - diff = ranges[1].Item2 - ranges[1].Item1; + default: + case 1: + if (h.idx_prv == -1) + h.idx_prv = Grid.GlobalLinearIndex + h.start1 + h.offset; + + return h.idx_prv; + + case 2: + if (h.idx_prv == -1) + { + int idxoffset = Grid.GlobalLinearIndex + h.offset; + h.idx_prv = IntrinsicMath.DivRoundDown(idxoffset, h.diff2) + h.start1; + } + + return h.idx_prv; + + case 3: + if (h.idx_prv == -1) + { + + } + + return h.idx_prv; } } + } + /// + /// Represents an index passed as the second index argument. + /// + [ExcludeFromCodeCoverage] + public struct IndexJ + { /// - /// Gets the index of the loop. + /// The start of the first for loop, for index calculations. /// - /// Unused. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static implicit operator int(Index h) + private int start1; + + /// + /// The start of the second for loop, for index calculations. + /// + private int start2; + + /// + /// The start of the third for loop, for index calculations. + /// + private int start3; + + /// + /// The index to return. + /// + private int idx_prv; + + /// + /// The difference between the second set of ranges. + /// + private int diff2; + + /// + /// The difference between the third set of ranges. + /// + private int diff3; + + /// + /// The offset, in case of a followup kernel. + /// + private int offset; + + /// + /// The number of dimensions. + /// + private int dims; + + /// + /// Constructor. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. + /// The offset for followup kernels. 
+ internal IndexJ((int, int) range1, (int, int) range2, int offset = 0) { - return Grid.GlobalLinearIndex + h.start1 + h.offset; + this.offset = offset; + + start1 = range1.Item1; + start2 = range2.Item1; + start3 = -1; + idx_prv = -1; + diff2 = range2.Item2 - range2.Item1; + diff3 = -1; + dims = 2; } - public int i + /// + /// Constructor. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. + /// The offset for followup kernels. + internal IndexJ((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0) { - get + this.offset = offset; + + start1 = range1.Item1; + start2 = range2.Item1; + start3 = range3.Item1; + idx_prv = -1; + diff2 = range2.Item2 - range2.Item1; + diff3 = range3.Item2 - range3.Item1; + dims = 3; + } + + /// + /// Casts an index to an int. + /// + /// The Index struct to cast. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator int(IndexJ h) + { + switch (h.dims) { - if (i_prv == -1) - { - int idxoffset = Grid.GlobalLinearIndex + offset; - i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff); - j_prv = idxoffset - i_prv * diff; - i_prv += start1; - j_prv += start2; - } - - return i_prv; + default: + case 2: + if (h.idx_prv == -1) + { + int idxoffset = Grid.GlobalLinearIndex + h.offset; + h.idx_prv = (idxoffset % h.diff2) + h.start2; + } + + return h.idx_prv; + + case 3: + if (h.idx_prv == -1) + { + + } + + return h.idx_prv; } } + } + + /// + /// Represents an index passed as the third index argument. + /// + [ExcludeFromCodeCoverage] + public struct IndexK + { + /// + /// The start of the first for loop, for index calculations. + /// + private int start1; + + /// + /// The start of the second for loop, for index calculations. + /// + private int start2; + + /// + /// The start of the third for loop, for index calculations. + /// + private int start3; + + /// + /// The index to return. 
+ /// + private int idx_prv; + + /// + /// The difference between the second set of ranges. + /// + private int diff2; + + /// + /// The difference between the third set of ranges. + /// + private int diff3; + + /// + /// The offset, in case of a followup kernel. + /// + private int offset; + + /// + /// Constructor. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. + /// The offset for followup kernels. + internal IndexK((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0) + { + this.offset = offset; + + start1 = range1.Item1; + start2 = range2.Item1; + start3 = range3.Item1; + idx_prv = -1; + diff2 = range2.Item2 - range2.Item1; + diff3 = range3.Item2 - range3.Item1; + } - public int j + /// + /// Casts an index to an int. + /// + /// The Index struct to cast. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator int(IndexK h) { - get + if (h.idx_prv == -1) { - if (j_prv == -1) - { - int idxoffset = Grid.GlobalLinearIndex + offset; - i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff); - j_prv = idxoffset - i_prv * diff; - i_prv += start1; - j_prv += start2; - } - - return j_prv; + } + + return h.idx_prv; } } } \ No newline at end of file diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs index 1800e673..0d332da2 100644 --- a/benchmarks/GPUHeatTransfer/Program.cs +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -132,18 +132,14 @@ public void DoStep() break; case ParType.DMPGPU: - DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => { - int i = idx.i; - int j = idx.j; //set the scratch array to the average of the surrounding cells scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); }); - 
DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => { - int i = idx.i; - int j = idx.j; grid[i, j] = scratch[i, j]; }); break; @@ -253,18 +249,14 @@ public void DoStep() break; case ParType.DMPGPU: - DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => { - int i = idx.i; - int j = idx.j; //set the scratch array to the average of the surrounding cells scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); }); - DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => { - int i = idx.i; - int j = idx.j; grid[i, j] = scratch[i, j]; }); break; From 1f201884a0b2f5d41ccad7d5138e63d203293c4f Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 07:53:33 -0600 Subject: [PATCH 49/61] add test for forcollapse --- DotMP-Tests/GPUTests.cs | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs index 244f063c..f2b7d9b8 100644 --- a/DotMP-Tests/GPUTests.cs +++ b/DotMP-Tests/GPUTests.cs @@ -5,6 +5,7 @@ using System.Text.Json.Serialization; using System.Threading; using DotMP; +using DotMP.GPU; using FluentAssertions; using Xunit; using Xunit.Abstractions; @@ -71,6 +72,49 @@ public void GPU_for_works() Assert.Equal(a, a_old); } + /// + /// Tests to make sure that DotMP.GPU.Parallel.ForCollapse produces correct results. 
+ /// + [Fact] + public void Collapse_works() + { + int[,] iters_hit = new int[1024, 1024]; + + using (var buf = new Buffer(iters_hit, DotMP.GPU.Buffer.Behavior.ToFrom)) + { + DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, action: (i, j, iters_hit) => + { + iters_hit[i, j]++; + }); + } + + for (int i = 0; i < 1024; i++) + for (int j = 0; j < 1024; j++) + if (i >= 258 && i < 512 && j >= 512 && j < 600) + iters_hit[i, j].Should().Be(1); + else + iters_hit[i, j].Should().Be(0); + + /*iters_hit = null; + + int[,,] iters_hit_3 = new int[128, 128, 64]; + + DotMP.Parallel.ParallelForCollapse((35, 64), (16, 100), (10, 62), num_threads: 8, chunk_size: 3, schedule: Schedule.Dynamic, action: (i, j, k) => + { + DotMP.Atomic.Inc(ref iters_hit_3[i, j, k]); + }); + + for (int i = 0; i < 128; i++) + for (int j = 0; j < 128; j++) + for (int k = 0; k < 64; k++) + if (i >= 35 && i < 64 && j >= 16 && j < 100 && k >= 10 && k < 62) + iters_hit_3[i, j, k].Should().Be(1); + else + iters_hit_3[i, j, k].Should().Be(0); + + iters_hit_3 = null;*/ + } + private void random_init(T[] arr) { Random r = new Random(); From 141aafdc2920a69d97e6f29e9dc59f7262b2ee15 Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:41:09 -0600 Subject: [PATCH 50/61] tidy up calls --- DotMP-Tests/GPUTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs index f2b7d9b8..0ff2e67c 100644 --- a/DotMP-Tests/GPUTests.cs +++ b/DotMP-Tests/GPUTests.cs @@ -82,7 +82,7 @@ public void Collapse_works() using (var buf = new Buffer(iters_hit, DotMP.GPU.Buffer.Behavior.ToFrom)) { - DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, action: (i, j, iters_hit) => + DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, (i, j, iters_hit) => { iters_hit[i, j]++; }); From b5ca7313fd33b38fe452e6c6ea8742fa006f0425 Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:41:38 -0600 Subject: [PATCH 51/61] 
implement caching of indices --- DotMP/GPU/AcceleratorHandler.tt | 183 +++++++++++++++++++++++++++----- 1 file changed, 155 insertions(+), 28 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 0fbe6ee5..e3582530 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -30,7 +30,7 @@ namespace DotMP.GPU /// /// The handler class managing GPU acceleration. /// - internal class AcceleratorHandler + internal sealed class AcceleratorHandler { /// /// Determines if a GPU context has been initialized yet. @@ -51,7 +51,15 @@ namespace DotMP.GPU /// /// Kernel cache. /// - private static Dictionary kernels = new Dictionary(); + private static Dictionary kernels = new Dictionary(); + + private static Dictionary>> indices1d = new Dictionary>>(); + + private static Dictionary, Buffer>> indices2d = + new Dictionary, Buffer>>(); + + private static Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>> indices3d = + new Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>>(); /// /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data. @@ -98,11 +106,11 @@ namespace DotMP.GPU /// The action provided on the CPU. /// The calling location. /// The GPU kernel. - private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > GetKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> - >(Action(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> > action, string src) <# for (int i = 0; i < c; i++) { #> @@ -112,7 +120,7 @@ namespace DotMP.GPU if (!kernels.ContainsKey(src)) kernels.Add(src, accelerator.LoadStreamKernel(action)); - return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> >) kernels[src]; } @@ -125,11 +133,11 @@ namespace DotMP.GPU /// The action provided on the CPU. /// The calling location. /// The GPU kernel. 
- private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > GetKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> - >(Action(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> > action, string src) <# for (int i = 0; i < c; i++) { #> @@ -139,7 +147,7 @@ namespace DotMP.GPU if (!kernels.ContainsKey(src)) kernels.Add(src, accelerator.LoadStreamKernel(action)); - return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> >) kernels[src]; } @@ -152,11 +160,11 @@ namespace DotMP.GPU /// The action provided on the CPU. /// The calling location. /// The GPU kernel. - private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > GetKernel< <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> - >(Action(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> > action, string src) <# for (int i = 0; i < c; i++) { #> @@ -166,12 +174,129 @@ namespace DotMP.GPU if (!kernels.ContainsKey(src)) kernels.Add(src, accelerator.LoadStreamKernel(action)); - return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> >) kernels[src]; } <# } #> + /// + /// Precomputes and caches the indices for a 1D for loop. + /// + /// The range of the for loop. + /// The calling location in the source code. + /// A buffer representing the indices. 
+ internal Index Get1DIdx((int, int) range, string src) + { + if (indices1d.ContainsKey(src)) + { + var data = indices1d[src]; + if (data.Item1 == range.Item1 && data.Item2 == range.Item2) + return new Index(data.Item3); + else + data.Item3.Dispose(); + } + + int[] indices = new int[range.Item2 - range.Item1]; + + for (int i = 0; i < indices.Length; i++) + indices[i] = i + range.Item1; + + var buf = new Buffer(indices, Buffer.Behavior.To); + indices1d[src] = (range.Item1, range.Item2, buf); + return new Index(buf); + } + + internal ValueTuple Get2DIdx((int, int) range1, (int, int) range2, string src) + { + if (indices2d.ContainsKey(src)) + { + var data = indices2d[src]; + if (data.Item1 == range1.Item1 && data.Item2 == range1.Item2 && + data.Item3 == range2.Item1 && data.Item4 == range2.Item2) + return (new Index(data.Item5), new Index(data.Item6)); + else + { + data.Item5.Dispose(); + data.Item6.Dispose(); + } + } + + int[] indi = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1)]; + int[] indj = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1)]; + + int ci = range1.Item1, cj = range2.Item1; + + for (int i = 0; i < indi.Length; i++) + { + indi[i] = ci; + indj[i] = cj; + + if (++cj == range2.Item2) + { + cj = range2.Item1; + ++ci; + } + } + + Console.WriteLine("Computed new indices..."); + var b1 = new Buffer(indi, Buffer.Behavior.To); + var b2 = new Buffer(indj, Buffer.Behavior.To); + indices2d[src] = (range1.Item1, range1.Item2, range2.Item1, range2.Item2, b1, b2); + + return (new Index(b1), new Index(b2)); + } + + internal ValueTuple Get3DIdx((int, int) range1, (int, int) range2, (int, int) range3, string src) + { + if (indices3d.ContainsKey(src)) + { + var data = indices3d[src]; + if (data.Item1.Item1 == range1.Item1 && data.Item1.Item2 == range1.Item2 && + data.Item2.Item1 == range2.Item1 && data.Item2.Item2 == range2.Item2 && + data.Item3.Item1 == range3.Item1 && data.Item3.Item2 == range3.Item2) + return (new 
Index(data.Item4), new Index(data.Item5), new Index(data.Item6)); + else + { + data.Item4.Dispose(); + data.Item5.Dispose(); + data.Item6.Dispose(); + } + } + + int[] indi = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + int[] indj = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + int[] indk = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + + int ci = range1.Item1, cj = range2.Item1, ck = range3.Item1; + + for (int i = 0; i < indi.Length; i++) + { + indi[i] = ci; + indj[i] = cj; + indk[i] = ck; + + if (++ck == range3.Item2) + { + ck = range3.Item1; + + if (++cj == range2.Item2) + { + cj = range2.Item1; + ++ci; + } + } + } + + var b1 = new Buffer(indi, Buffer.Behavior.To); + var b2 = new Buffer(indj, Buffer.Behavior.To); + var b3 = new Buffer(indk, Buffer.Behavior.To); + indices3d[src] = ((range1.Item1, range1.Item2), (range2.Item1, range2.Item2), (range3.Item1, range3.Item2), b1, b2, b3); + + return (new Index(b1), new Index(b2), new Index(b3)); + } + + <# for (int c = 1; c <= max; c++) { #> /// /// Dispatches a linear kernel with the given number of parameters. @@ -186,15 +311,15 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> >((int, int) range1, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
"," : "" #> <# } #> > action, string src) <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> { + var idx = Get1DIdx(range1, src); var len = range1.Item2 - range1.Item1; - var idx = new IndexI(range1); var kernel = GetKernel(action, src); @@ -208,7 +333,8 @@ namespace DotMP.GPU if (not_done > 0) { - idx = new IndexI(range1, len - (not_done)); + int offset = len - not_done; + idx.AddOffset(offset); kernel((1, not_done), idx <# for (int i = 0; i < c; i++) { #> @@ -236,7 +362,7 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> >((int, int) range1, (int, int) range2, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, string src) <# for (int i = 0; i < c; i++) { #> @@ -244,14 +370,17 @@ namespace DotMP.GPU <# } #> { var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1); - var i = new IndexI(range1, range2); - var j = new IndexJ(range1, range2); + (var i, var j) = Get2DIdx(range1, range2, src); var kernel = GetKernel(action, src); +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + kernel((len / block_size, block_size), i, j <# for (int i = 0; i < c; i++) { #> - , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) + , gpu<#= i + 1 #> <# } #> ); @@ -260,12 +389,12 @@ namespace DotMP.GPU if (not_done > 0) { int offset = len - not_done; - i = new IndexI(range1, range2, offset); - j = new IndexJ(range1, range2, offset); + i.AddOffset(offset); + j.AddOffset(offset); kernel((1, not_done), i, j <# for (int i = 0; i < c; i++) { #> - , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) + , gpu<#= i + 1 #> <# } #> ); } @@ -290,7 +419,7 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? 
"," : "") #> <# } #> >((int, int) range1, (int, int) range2, (int, int) range3, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, string src) <# for (int i = 0; i < c; i++) { #> @@ -298,9 +427,7 @@ namespace DotMP.GPU <# } #> { var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1); - var i = new IndexI(range1, range2, range3); - var j = new IndexJ(range1, range2, range3); - var k = new IndexK(range1, range2, range3); + (var i, var j, var k) = Get3DIdx(range1, range2, range3, src); var kernel = GetKernel(action, src); @@ -315,9 +442,9 @@ namespace DotMP.GPU if (not_done > 0) { int offset = len - not_done; - i = new IndexI(range1, range2, range3, offset); - j = new IndexJ(range1, range2, range3, offset); - k = new IndexK(range1, range2, range3, offset); + i.AddOffset(offset); + j.AddOffset(offset); + k.AddOffset(offset); kernel((1, not_done), i, j, k <# for (int i = 0; i < c; i++) { #> From 2275b97f0fb143329a43238144bd987a1d914790 Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:42:08 -0600 Subject: [PATCH 52/61] add support for 3D buffers --- DotMP/GPU/Buffer.cs | 53 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs index 87b756e3..df45c507 100644 --- a/DotMP/GPU/Buffer.cs +++ b/DotMP/GPU/Buffer.cs @@ -50,7 +50,7 @@ public enum Behavior /// /// Buffer to manage GPU memory. Should only be created on the CPU. /// - public class Buffer : IDisposable + public sealed class Buffer : IDisposable where T : unmanaged { /// @@ -63,6 +63,11 @@ public class Buffer : IDisposable /// private MemoryBuffer2D buf2d; + /// + /// The ILGPU buffer for 3D arrays. + /// + private MemoryBuffer3D buf3d; + /// /// Behavior of the data, as specified by Behavior. 
/// @@ -78,6 +83,11 @@ public class Buffer : IDisposable /// private T[,] data2d; + /// + /// The CPU 3D array, so that we can copy the data back. + /// + private T[,,] data3d; + /// /// Handler int for the number of dimensions in the array. /// @@ -156,6 +166,33 @@ public Buffer(T[,] data, Buffer.Behavior behavior) Dimensions = 2; } + /// + /// Constructor for buffer object. Allocates a 3D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. + public Buffer(T[,,] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data3d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf3d = AcceleratorHandler.accelerator.Allocate3DDenseZY(data); + break; + case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: + buf3d = AcceleratorHandler.accelerator.Allocate3DDenseZY((data.GetLength(0), data.GetLength(1), data.GetLength(2))); + break; + } + + Dimensions = 3; + } + /// /// Dispose of the buffer, freeing GPU memory and copying any relevant data back to the CPU. /// @@ -179,6 +216,15 @@ public void Dispose() buf2d.Dispose(); } + else if (Dimensions == 3) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + System.Buffer.BlockCopy(buf3d.GetAsArray3D(), 0, data3d, 0, Unsafe.SizeOf() * data3d.Length); + } + + buf3d.Dispose(); + } } /// @@ -190,5 +236,10 @@ public void Dispose() /// Get the view of the memory for the GPU. /// internal ArrayView2D View2D { get => buf2d.View; } + + /// + /// Get the view of the memory for the GPU. 
+ /// + internal ArrayView3D View3D { get => buf3d.View; } } } \ No newline at end of file From 81c39ba413456cf9a922c490b68be20c1617dd3d Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:42:36 -0600 Subject: [PATCH 53/61] migrate to new index technique --- DotMP/GPU/Gpu.tt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt index 060c3d8a..6cf2d841 100644 --- a/DotMP/GPU/Gpu.tt +++ b/DotMP/GPU/Gpu.tt @@ -59,7 +59,7 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> >(int start, int end, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> @@ -89,7 +89,7 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> >((int, int) range1, (int, int) range2, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) <# for (int i = 0; i < c; i++) { #> @@ -122,7 +122,7 @@ namespace DotMP.GPU <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> >((int, int) range1, (int, int) range2, (int, int) range3, <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> - Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
"," : "" #> <# } #> > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) <# for (int i = 0; i < c; i++) { #> @@ -137,4 +137,4 @@ namespace DotMP.GPU } <# } #> } -} \ No newline at end of file +} From af259ec49d52f4936c6da3eb6f68cae222aa48d8 Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:43:06 -0600 Subject: [PATCH 54/61] WIP --- DotMP/GPU/GpuArray.cs | 91 +++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 33 deletions(-) diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index a7d7d705..b4cbe394 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -15,6 +15,7 @@ */ using ILGPU; +using ILGPU.IR.Values; using ILGPU.Runtime; using System; using System.Diagnostics.CodeAnalysis; @@ -30,15 +31,20 @@ public struct GPUArray where T : unmanaged { /// - /// The ILGPU buffer for 1D arrays. + /// The ILGPU view for 1D arrays. /// private ArrayView1D view1d; /// - /// The ILGPU buffer for 2D arrays. + /// The ILGPU view for 2D arrays. /// private ArrayView2D view2d; + /// + /// The ILGPU view for 3D arrays. + /// + private ArrayView3D view3d; + /// /// Number of dimensions. /// @@ -47,30 +53,37 @@ public struct GPUArray /// /// Constructor. /// - /// The ArrayView to wrap. - public GPUArray(Buffer arrayView) + /// The Buffer to create an array from. + internal GPUArray(Buffer buf) { - if (arrayView.Dimensions == 1) - { - view1d = arrayView.View1D; - // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. - view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; - } - else if (arrayView.Dimensions == 2) - { - // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. - view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; - view2d = arrayView.View2D; - } - else + switch (buf.Dimensions) { - // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. 
- view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; - // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. - view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + /*case 1: + view1d = buf.View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; + break;*/ + default: + case 2: + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + //view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + view2d = buf.View2D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + //view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; + break; + /*case 3: + default: + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + view3d = buf.View3D; + break;*/ } - dims = arrayView.Dimensions; + dims = buf.Dimensions; } /// @@ -78,11 +91,10 @@ public GPUArray(Buffer arrayView) /// /// The ID to index into. /// The data at that ID. - public T this[int idx] - { - get => view1d[idx]; - set => view1d[idx] = value; - } + //public ref T this[int idx] + //{ + // get => ref view1d[idx]; + //} /// /// Overload for [,] operator. @@ -90,12 +102,23 @@ public T this[int idx] /// The first ID to index into. /// The second ID to index into. /// The data at that ID. - public T this[int i, int j] + public ref T this[int i, int j] { - get => view2d[i, j]; - set => view2d[i, j] = value; + get => ref view2d[i, j]; } + /// + /// Overload for [,,] operator. + /// + /// The first ID to index into. + /// The second ID to index into. + /// The third ID to index into. + /// The data at that ID. 
+ //public ref T this[int i, int j, int k] + //{ + // get => ref view3d[i, j, k]; + //} + /// /// Gets the length of the array. /// @@ -105,13 +128,15 @@ public int Length { switch (dims) { - case 1: + //case 1: default: - return view1d.IntLength; + // return view1d.IntLength; case 2: return view2d.IntLength; + //case 3: + // return view3d.IntLength; } } } } -} \ No newline at end of file +} From 7e4070aaf1a48c06df632b121bef7ce35c572ffb Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:43:27 -0600 Subject: [PATCH 55/61] new index technique via index caching --- DotMP/GPU/Index.cs | 329 +++------------------------------------------ 1 file changed, 17 insertions(+), 312 deletions(-) diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index f80cf57d..e6da53d3 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -15,8 +15,11 @@ */ using ILGPU; +using ILGPU.Runtime; using System.Diagnostics.CodeAnalysis; +using System.Linq; using System.Runtime.CompilerServices; +using System.Xml; namespace DotMP.GPU { @@ -24,329 +27,31 @@ namespace DotMP.GPU /// Represents an index passed as the first index argument. /// [ExcludeFromCodeCoverage] - public struct IndexI + public struct Index { - /// - /// The start of the first for loop, for index calculations. - /// - private int start1; - - /// - /// The start of the second for loop, for index calculations. - /// - private int start2; - - /// - /// The start of the third for loop, for index calculations. - /// - private int start3; - - /// - /// The index to return. - /// - private int idx_prv; - - /// - /// The difference between the second set of ranges. - /// - private int diff2; - - /// - /// The difference between the third set of ranges. - /// - private int diff3; - - /// - /// The offset, in case of a followup kernel. - /// - private int offset; - - /// - /// The number of dimensions. - /// - private int dims; - - /// - /// Constructor. - /// - /// The range of the for loop. 
- /// The offset for followup kernels. - internal IndexI((int, int) range, int offset = 0) - { - this.offset = offset; - - start1 = range.Item1; - start2 = -1; - start3 = -1; - idx_prv = -1; - diff2 = -1; - diff3 = -1; - dims = 1; - } - - /// - /// Constructor. - /// - /// The outer range of the for loop. - /// The inner range of the for loop. - /// The offset for followup kernels. - internal IndexI((int, int) range1, (int, int) range2, int offset = 0) - { - this.offset = offset; - - start1 = range1.Item1; - start2 = range2.Item1; - start3 = -1; - idx_prv = -1; - diff2 = range2.Item2 - range2.Item1; - diff3 = -1; - dims = 2; - } - - /// - /// Constructor. - /// - /// The outer range of the for loop. - /// The middle range of the for loop. - /// The inner range of the for loop. - /// The offset for followup kernels. - internal IndexI((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0) - { - this.offset = offset; - - start1 = range1.Item1; - start2 = range2.Item1; - start3 = range3.Item1; - idx_prv = -1; - diff2 = range2.Item2 - range2.Item1; - diff3 = range3.Item2 - range3.Item1; - dims = 3; - } - - /// - /// Casts an index to an int. - /// - /// The Index struct to cast. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static implicit operator int(IndexI h) - { - switch (h.dims) - { - default: - case 1: - if (h.idx_prv == -1) - h.idx_prv = Grid.GlobalLinearIndex + h.start1 + h.offset; - - return h.idx_prv; - - case 2: - if (h.idx_prv == -1) - { - int idxoffset = Grid.GlobalLinearIndex + h.offset; - h.idx_prv = IntrinsicMath.DivRoundDown(idxoffset, h.diff2) + h.start1; - } - - return h.idx_prv; - - case 3: - if (h.idx_prv == -1) - { - - } - - return h.idx_prv; - } - } - } - - /// - /// Represents an index passed as the second index argument. - /// - [ExcludeFromCodeCoverage] - public struct IndexJ - { - /// - /// The start of the first for loop, for index calculations. 
- /// - private int start1; - - /// - /// The start of the second for loop, for index calculations. - /// - private int start2; - - /// - /// The start of the third for loop, for index calculations. - /// - private int start3; - - /// - /// The index to return. - /// - private int idx_prv; - - /// - /// The difference between the second set of ranges. - /// - private int diff2; - - /// - /// The difference between the third set of ranges. - /// - private int diff3; - - /// - /// The offset, in case of a followup kernel. - /// + private ArrayView1D lookup; private int offset; + private int idx; - /// - /// The number of dimensions. - /// - private int dims; - - /// - /// Constructor. - /// - /// The outer range of the for loop. - /// The inner range of the for loop. - /// The offset for followup kernels. - internal IndexJ((int, int) range1, (int, int) range2, int offset = 0) + internal Index(Buffer buf) { - this.offset = offset; - - start1 = range1.Item1; - start2 = range2.Item1; - start3 = -1; - idx_prv = -1; - diff2 = range2.Item2 - range2.Item1; - diff3 = -1; - dims = 2; + this.lookup = buf.View1D; + offset = 0; + idx = -1; } - /// - /// Constructor. - /// - /// The outer range of the for loop. - /// The middle range of the for loop. - /// The inner range of the for loop. - /// The offset for followup kernels. - internal IndexJ((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0) + internal void AddOffset(int offset) { this.offset = offset; - - start1 = range1.Item1; - start2 = range2.Item1; - start3 = range3.Item1; - idx_prv = -1; - diff2 = range2.Item2 - range2.Item1; - diff3 = range3.Item2 - range3.Item1; - dims = 3; } - /// - /// Casts an index to an int. - /// - /// The Index struct to cast. 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static implicit operator int(IndexJ h) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator int(Index i) { - switch (h.dims) - { - default: - case 2: - if (h.idx_prv == -1) - { - int idxoffset = Grid.GlobalLinearIndex + h.offset; - h.idx_prv = (idxoffset % h.diff2) + h.start2; - } - - return h.idx_prv; - - case 3: - if (h.idx_prv == -1) - { - - } - - return h.idx_prv; - } - } - } - - /// - /// Represents an index passed as the third index argument. - /// - [ExcludeFromCodeCoverage] - public struct IndexK - { - /// - /// The start of the first for loop, for index calculations. - /// - private int start1; - - /// - /// The start of the second for loop, for index calculations. - /// - private int start2; - - /// - /// The start of the third for loop, for index calculations. - /// - private int start3; - - /// - /// The index to return. - /// - private int idx_prv; - - /// - /// The difference between the second set of ranges. - /// - private int diff2; - - /// - /// The difference between the third set of ranges. - /// - private int diff3; - - /// - /// The offset, in case of a followup kernel. - /// - private int offset; - - /// - /// Constructor. - /// - /// The outer range of the for loop. - /// The middle range of the for loop. - /// The inner range of the for loop. - /// The offset for followup kernels. - internal IndexK((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0) - { - this.offset = offset; - - start1 = range1.Item1; - start2 = range2.Item1; - start3 = range3.Item1; - idx_prv = -1; - diff2 = range2.Item2 - range2.Item1; - diff3 = range3.Item2 - range3.Item1; - } - - /// - /// Casts an index to an int. - /// - /// The Index struct to cast. 
- [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static implicit operator int(IndexK h) - { - if (h.idx_prv == -1) - { - - } + if (i.idx == -1) + i.idx = i.lookup[Grid.GlobalLinearIndex + i.offset]; - return h.idx_prv; + return i.idx; } } -} \ No newline at end of file +} From a6d0072c594bf84823856ec2f880119d1ec9946b Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:43:48 -0600 Subject: [PATCH 56/61] update benchmarks --- benchmarks/GPUHeatTransfer/Program.cs | 4 +- benchmarks/GPUOverhead/Program.cs | 6 +- benchmarks/ILGPUOverhead/ILGPUOverhead.csproj | 18 ++++++ benchmarks/ILGPUOverhead/Program.cs | 63 +++++++++++++++++++ 4 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 benchmarks/ILGPUOverhead/ILGPUOverhead.csproj create mode 100644 benchmarks/ILGPUOverhead/Program.cs diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs index 0d332da2..75d0747f 100644 --- a/benchmarks/GPUHeatTransfer/Program.cs +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -37,7 +37,7 @@ public class HeatTransfer public enum ParType { DMPFor, DMPGPU } // test dims of 100x100, 1000x1000, and 5000x5000 - [Params(500)] + [Params(768)] public int dim; // test with 10 steps and 100 steps @@ -159,7 +159,7 @@ public class HeatTransferVerify public enum ParType { DMPFor, DMPGPU } // test dims of 100x100, 1000x1000, and 5000x5000 - public int dim = 500; + public int dim = 1000; // test with 10 steps and 100 steps public int steps = 100; diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs index b9ff18a5..579d3868 100644 --- a/benchmarks/GPUOverhead/Program.cs +++ b/benchmarks/GPUOverhead/Program.cs @@ -27,20 +27,20 @@ [EventPipeProfiler(EventPipeProfile.CpuSampling)] public class Overhead { - DotMP.GPU.Buffer buf; + DotMP.GPU.Buffer buf; // run the setup [GlobalSetup] public void Setup() { - buf = new DotMP.GPU.Buffer(new byte[1], DotMP.GPU.Buffer.Behavior.NoCopy); + buf = new 
DotMP.GPU.Buffer(new int[1, 1], DotMP.GPU.Buffer.Behavior.NoCopy); } //run the simulation [Benchmark] public void TestOverhead() { - DotMP.GPU.Parallel.ParallelFor(0, 1, buf, (i, buf) => { }); + DotMP.GPU.Parallel.ParallelForCollapse((0, 500), (0, 500), buf, (i, j, buf) => { }); } } diff --git a/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj new file mode 100644 index 00000000..9cf0a6f0 --- /dev/null +++ b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj @@ -0,0 +1,18 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + + + + + diff --git a/benchmarks/ILGPUOverhead/Program.cs b/benchmarks/ILGPUOverhead/Program.cs new file mode 100644 index 00000000..862ae1af --- /dev/null +++ b/benchmarks/ILGPUOverhead/Program.cs @@ -0,0 +1,63 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Running; +using BenchmarkDotNet.Diagnosers; +using System; +using ILGPU; +using ILGPU.Runtime; + +/* jscpd:ignore-start */ + +[SimpleJob(RuntimeMoniker.Net60)] +[ThreadingDiagnoser] +[HardwareCounters] +[EventPipeProfiler(EventPipeProfile.CpuSampling)] +public class Overhead +{ + Action> kernel; + ArrayView1D data; + + // run the setup + [GlobalSetup] + public void Setup() + { + var context = Context.CreateDefault(); + var accelerator = context.Devices[1].CreateAccelerator(context); + kernel = accelerator.LoadStreamKernel>(arr => { }); + data = accelerator.Allocate1D(1); + } + + //run the simulation + [Benchmark] + public void TestOverhead() + { + kernel((1, 256), data); + } +} + +/* jscpd:ignore-end */ + +// driver +public class Program +{ + public static void Main(string[] args) + { + BenchmarkRunner.Run(); + } +} From ca0b54a25c46afa73a4d597f612cabc6892b371b Mon Sep 17 00:00:00 2001 From: Lane Date: Mon, 13 Nov 2023 12:44:48 -0600 Subject: [PATCH 57/61] run dotnet format --- DotMP/GPU/GpuArray.cs | 22 +++++++++++----------- DotMP/GPU/Index.cs | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index b4cbe394..11efff12 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -65,7 +65,7 @@ internal GPUArray(Buffer buf) // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; break;*/ - default: + default: case 2: // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. //view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; @@ -73,14 +73,14 @@ internal GPUArray(Buffer buf) // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. //view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; break; - /*case 3: - default: - // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. 
- view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; - // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. - view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; - view3d = buf.View3D; - break;*/ + /*case 3: + default: + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + view3d = buf.View3D; + break;*/ } dims = buf.Dimensions; @@ -133,8 +133,8 @@ public int Length // return view1d.IntLength; case 2: return view2d.IntLength; - //case 3: - // return view3d.IntLength; + //case 3: + // return view3d.IntLength; } } } diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index e6da53d3..159fee65 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -45,7 +45,7 @@ internal void AddOffset(int offset) this.offset = offset; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static implicit operator int(Index i) { if (i.idx == -1) From 1c72a2fe4c11b4f5c5616a56cb5ff9c38b973700 Mon Sep 17 00:00:00 2001 From: Lane Date: Tue, 14 Nov 2023 17:49:18 -0600 Subject: [PATCH 58/61] add support for .NET 8 --- DotMP/DotMP.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DotMP/DotMP.csproj b/DotMP/DotMP.csproj index 00a47b68..24113bd7 100644 --- a/DotMP/DotMP.csproj +++ b/DotMP/DotMP.csproj @@ -1,7 +1,7 @@ - net6.0;net7.0 + net6.0;net7.0;net8.0 DotMP DotMP 2.0.0 From 8432ed7150717a312fa44d66a203172faa9c30a3 Mon Sep 17 00:00:00 2001 From: Lane Date: Wed, 15 Nov 2023 23:51:54 -0600 Subject: [PATCH 59/61] comments, optimizations --- DotMP/GPU/AcceleratorHandler.tt | 45 +++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index e3582530..156d155a 100644 
--- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -52,12 +52,18 @@ namespace DotMP.GPU /// Kernel cache. /// private static Dictionary kernels = new Dictionary(); - + /// + /// Index cache for 1D kernels. + /// private static Dictionary>> indices1d = new Dictionary>>(); - + /// + /// Index cache for 2D kernels. + /// private static Dictionary, Buffer>> indices2d = new Dictionary, Buffer>>(); - + /// + /// Index cache for 3D kernels. + /// private static Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>> indices3d = new Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>>(); @@ -185,7 +191,7 @@ namespace DotMP.GPU /// /// The range of the for loop. /// The calling location in the source code. - /// A buffer representing the indices. + /// The calculated index. internal Index Get1DIdx((int, int) range, string src) { if (indices1d.ContainsKey(src)) @@ -207,6 +213,13 @@ namespace DotMP.GPU return new Index(buf); } + /// + /// Precomputes and caches the indices for a 2D for loop. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. + /// The calling location in the source code. + /// A tuple of calculated indices. internal ValueTuple Get2DIdx((int, int) range1, (int, int) range2, string src) { if (indices2d.ContainsKey(src)) @@ -247,6 +260,14 @@ namespace DotMP.GPU return (new Index(b1), new Index(b2)); } + /// + /// Precomputes and caches the indices for a 3D for loop. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. + /// The calling location in the source code. + /// A tuple of calculated indices. 
internal ValueTuple Get3DIdx((int, int) range1, (int, int) range2, (int, int) range3, string src) { if (indices3d.ContainsKey(src)) @@ -323,9 +344,13 @@ namespace DotMP.GPU var kernel = GetKernel(action, src); +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + kernel((len / block_size, block_size), idx <# for (int i = 0; i < c; i++) { #> - , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) + , gpu<#= i + 1 #> <# } #> ); @@ -338,7 +363,7 @@ namespace DotMP.GPU kernel((1, not_done), idx <# for (int i = 0; i < c; i++) { #> - , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) + , gpu<#= i + 1 #> <# } #> ); } @@ -431,9 +456,13 @@ namespace DotMP.GPU var kernel = GetKernel(action, src); +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + kernel((len / block_size, block_size), i, j, k <# for (int i = 0; i < c; i++) { #> - , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) + , gpu<#= i + 1 #> <# } #> ); @@ -448,7 +477,7 @@ namespace DotMP.GPU kernel((1, not_done), i, j, k <# for (int i = 0; i < c; i++) { #> - , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>) + , gpu<#= i + 1 #> <# } #> ); } From 97571c53a5bfec68bca1a980ba2ee0fe01e5b26b Mon Sep 17 00:00:00 2001 From: Lane Date: Wed, 15 Nov 2023 23:52:18 -0600 Subject: [PATCH 60/61] temporary --- DotMP/GPU/GpuArray.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs index 11efff12..ca4ad643 100644 --- a/DotMP/GPU/GpuArray.cs +++ b/DotMP/GPU/GpuArray.cs @@ -33,7 +33,7 @@ public struct GPUArray /// /// The ILGPU view for 1D arrays. /// - private ArrayView1D view1d; + //private ArrayView1D view1d; /// /// The ILGPU view for 2D arrays. @@ -43,7 +43,7 @@ public struct GPUArray /// /// The ILGPU view for 3D arrays. /// - private ArrayView3D view3d; + //private ArrayView3D view3d; /// /// Number of dimensions. 
From f87b6876c3532f54a477ac8628adf85e557766a1 Mon Sep 17 00:00:00 2001 From: Lane Date: Wed, 15 Nov 2023 23:52:36 -0600 Subject: [PATCH 61/61] add optimizations --- DotMP/GPU/Index.cs | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs index 159fee65..8de4dc3e 100644 --- a/DotMP/GPU/Index.cs +++ b/DotMP/GPU/Index.cs @@ -29,10 +29,23 @@ namespace DotMP.GPU [ExcludeFromCodeCoverage] public struct Index { + /// + /// Lookup table for indices. + /// private ArrayView1D lookup; + /// + /// Offset for followup kernels. + /// private int offset; + /// + /// Cached index. + /// private int idx; + /// + /// Constructor. + /// + /// Buffer representing the indices. internal Index(Buffer buf) { this.lookup = buf.View1D; @@ -40,11 +53,19 @@ internal Index(Buffer buf) idx = -1; } + /// + /// Adds an offset in preperation for a followup kernel. + /// + /// The offset to set. internal void AddOffset(int offset) { this.offset = offset; } + /// + /// Calculates the index and caches for future use. + /// + /// The Index object to cast to int. [MethodImpl(MethodImplOptions.AggressiveInlining)] public static implicit operator int(Index i) { /// Constructor. @@ -37,16 +45,62 @@ public struct Index /// The start of the parallel for loop. internal Index(int start) { - this.start = start; + this.start1 = start; + this.start2 = 0; + i_prv = -1; + j_prv = -1; + diff = 0; + } + + internal Index((int, int)[] ranges) + { + start1 = ranges[0].Item1; + start2 = ranges[1].Item1; + i_prv = -1; + j_prv = -1; + diff = ranges[1].Item2 - ranges[1].Item1; } /// /// Gets the index of the loop. /// /// Unused. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static implicit operator int(Index h) { - return Grid.GlobalIndex.X + h.start; + return Grid.GlobalLinearIndex + h.start1; + } + + public int i + { + get + { + if (i_prv == -1) + { + i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff); + j_prv = Grid.GlobalLinearIndex - i_prv * diff; + i_prv += start1; + j_prv += start2; + } + + return i_prv; + } + } + + public int j + { + get + { + if (j_prv == -1) + { + i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff); + j_prv = Grid.GlobalLinearIndex - i_prv * diff; + i_prv += start1; + j_prv += start2; + } + + return j_prv; + } } } } \ No newline at end of file diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs index b3475816..58ca52e0 100644 --- a/benchmarks/GPUHeatTransfer/Program.cs +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -127,21 +127,15 @@ public void DoStep() break; case ParType.DMPGPU: - DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => { - for (int j = 1; j < 514 - 1; j++) - { - //set the scratch array to the average of the surrounding cells - scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); - } + //set the scratch array to the average of the surrounding cells + scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]); }); - DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => { - for (int j = 1; j < 514 - 1; j++) - { - grid[i, j] = scratch[i, j]; - } + grid[idx.i, idx.j] = scratch[idx.i, idx.j]; }); break; } @@ -245,21 +239,15 @@ public void DoStep() break; case ParType.DMPGPU: - 
DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => { - for (int j = 1; j < 514 - 1; j++) - { - //set the scratch array to the average of the surrounding cells - scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); - } + //set the scratch array to the average of the surrounding cells + scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]); }); - DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) => + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) => { - for (int j = 1; j < 514 - 1; j++) - { - grid[i, j] = scratch[i, j]; - } + grid[idx.i, idx.j] = scratch[idx.i, idx.j]; }); break; } @@ -306,4 +294,4 @@ public static void Main(string[] args) else BenchmarkRunner.Run(); } -} \ No newline at end of file +} From 348ad0c829ed1175a14766ee8c2d8785c0025988 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 17:20:37 -0600 Subject: [PATCH 33/61] implement T4 template for acceleratorhandler --- DotMP/GPU/AcceleratorHandler.tt | 713 ++++++++++++++++++++++++++++++++ 1 file changed, 713 insertions(+) create mode 100644 DotMP/GPU/AcceleratorHandler.tt diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt new file mode 100644 index 00000000..9542d5cf --- /dev/null +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -0,0 +1,713 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + + +<#@ template debug="false" hostspecific="false" language="C#" #> +<#@ output extension=".cs" #> +<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' }; + int max = 13; #> + +using System; +using System.Collections.Generic; +using System.Linq; +using ILGPU; +using ILGPU.Runtime; + +namespace DotMP.GPU +{ + /// + /// The handler class managing GPU acceleration. + /// + internal class AcceleratorHandler + { + /// + /// Determines if a GPU context has been initialized yet. + /// + private static bool initialized = false; + /// + /// The GPU context. + /// + private static Context context; + /// + /// The accelerator object. + /// + internal static Accelerator accelerator; + /// + /// Block size to use for kernels. + /// + private static int block_size; + /// + /// Kernel cache. + /// + private static Dictionary kernels = new Dictionary(); + + /// + /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data. 
+ /// + internal AcceleratorHandler() + { + if (initialized) return; + + context = Context.CreateDefault(); + var selectedDevice = context.Devices[0]; + + foreach (var d in context.Devices) + { + Console.WriteLine("Detected {0} accelerator.", d.ToString()); + + if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL) + selectedDevice = d; + if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda) + selectedDevice = d; + } + + accelerator = selectedDevice.CreateAccelerator(context); + //accelerator = context.Devices[0].CreateAccelerator(context); + + Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); + initialized = true; + block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256; + } + + /// + /// Synchronize pending operations. + /// + private void Synchronize() => accelerator.Synchronize(); + + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. +<# for (int c = 1; c <= max; c++) { #> + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> +> GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> +>(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> +> action, string src) +<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + + /// + /// Dispatches a kernel with one parameter. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The originating caller location. 
+ /// The base type of the first argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf, Action> action, string src) + where T : unmanaged + { + var idx = new Index(start); + + var kernel = GetKernel(action, src); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with two parameters. + /// + /// The starts and ends of the loop. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The originating caller location. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + internal void DispatchKernel((int, int)[] ranges, Buffer buf1, Buffer buf2, Action, GPUArray> action, string src) + where T : unmanaged + where U : unmanaged + { + int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y); + var idx = new Index(ranges); + + var kernel = GetKernel(action, src); + + kernel((len / block_size, block_size), idx, + new GPUArray(buf1), + new GPUArray(buf2)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with three parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The originating caller location. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. 
+ internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action, string src) + where T : unmanaged + where U : unmanaged + where V : unmanaged + { + var idx = new Index(start); + + var kernel = GetKernel(action, src); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with four parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The originating caller location. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action, string src) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + { + var idx = new Index(start); + + var kernel = GetKernel(action, src); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with five parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. 
+ /// The kernel to run on the GPU. + /// The originating caller location. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) + where T : unmanaged + where U : unmanaged + where V : unmanaged + where W : unmanaged + where X : unmanaged + { + var idx = new Index(start); + + var kernel = accelerator.LoadStreamKernel(action); + + kernel(((end - start) / block_size, block_size), idx, + new GPUArray(buf1), + new GPUArray(buf2), + new GPUArray(buf3), + new GPUArray(buf4), + new GPUArray(buf5)); + + Synchronize(); + } + + /// + /// Dispatches a kernel with six parameters. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. + /// The first buffer to run the kernel with. + /// The second buffer to run the kernel with. + /// The third buffer to run the kernel with. + /// The fourth buffer to run the kernel with. + /// The fifth buffer to run the kernel with. + /// The sixth buffer to run the kernel with. + /// The kernel to run on the GPU. + /// The originating caller location. + /// The base type of the first argument. Must be an unmanaged type. + /// The base type of the second argument. Must be an unmanaged type. + /// The base type of the third argument. Must be an unmanaged type. + /// The base type of the fourth argument. Must be an unmanaged type. + /// The base type of the fifth argument. Must be an unmanaged type. + /// The base type of the sixth argument. Must be an unmanaged type. 
+        internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with seven parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with eight parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with nine parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with ten parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with eleven parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10),
+                new GPUArray<D>(buf11));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with twelve parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
+        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10),
+                new GPUArray<D>(buf11),
+                new GPUArray<E>(buf12));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with thirteen parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
+        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
+        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+            where F : unmanaged
+        {
+            var idx = new Index(start);
+
+            // Look the launcher up by call site, as the GetKernel-based overloads do, so the
+            // lambda is loaded once rather than on every dispatch.
+            if (!kernels.TryGetValue(src, out var cached))
+            {
+                cached = accelerator.LoadStreamKernel(action);
+                kernels.Add(src, cached);
+            }
+
+            var kernel = (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>>)cached;
+
+            // NOTE(review): (end - start) / block_size truncates; confirm that ranges which are
+            // not a multiple of block_size have their tail handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10),
+                new GPUArray<D>(buf11),
+                new GPUArray<E>(buf12),
+                new GPUArray<F>(buf13));
+
+            Synchronize();
+        }
+}
+}
\ No newline at end
of file From 91f16921638143f64597a87087fc37a112314bd4 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 17:20:47 -0600 Subject: [PATCH 34/61] ignore generated acceleratorhandler file --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index a24fbd40..df42484a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ docs/* .vscode *.opencover.xml *.sln -parfor_dump.cs +AcceleratorHandler.cs ProcessedREADME.md # User-specific files From 65408250c534d29704e55c00549f1157e54800b3 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 17:20:54 -0600 Subject: [PATCH 35/61] add T4 stuff --- DotMP/DotMP.csproj | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/DotMP/DotMP.csproj b/DotMP/DotMP.csproj index b618b701..00a47b68 100644 --- a/DotMP/DotMP.csproj +++ b/DotMP/DotMP.csproj @@ -23,7 +23,14 @@ - + + + True + True + GPU/AcceleratorHandler.tt + + + From a8872d12342883ab2a1c0724bc42753613c3ac87 Mon Sep 17 00:00:00 2001 From: Lane Date: Sat, 11 Nov 2023 17:21:04 -0600 Subject: [PATCH 36/61] delete now unnecessary files --- DotMP/GPU/AcceleratorHandler.cs | 758 ------------------------------- DotMP/GPU/Python/dispatch_gen.py | 93 ---- DotMP/GPU/Python/parfor_gen.py | 88 ---- 3 files changed, 939 deletions(-) delete mode 100644 DotMP/GPU/AcceleratorHandler.cs delete mode 100644 DotMP/GPU/Python/dispatch_gen.py delete mode 100644 DotMP/GPU/Python/parfor_gen.py diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs deleted file mode 100644 index 3608a6c1..00000000 --- a/DotMP/GPU/AcceleratorHandler.cs +++ /dev/null @@ -1,758 +0,0 @@ -/* -* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
-* Copyright (C) 2023 Phillip Allen Lane -* -* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser -* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or -* (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the -* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -* License for more details. -* -* You should have received a copy of the GNU Lesser General Public License along with this library; if not, -* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -*/ - -using System; -using System.Collections.Generic; -using System.Linq; -using ILGPU; -using ILGPU.Runtime; - -namespace DotMP.GPU -{ - /// - /// The handler class managing GPU acceleration. - /// - internal class AcceleratorHandler - { - /// - /// Determines if a GPU context has been initialized yet. - /// - private static bool initialized = false; - /// - /// The GPU context. - /// - private static Context context; - /// - /// The accelerator object. - /// - internal static Accelerator accelerator; - /// - /// Block size to use for kernels. - /// - private static int block_size; - /// - /// Kernel cache. - /// - private static Dictionary kernels = new Dictionary(); - - /// - /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data. 
- /// - internal AcceleratorHandler() - { - if (initialized) return; - - context = Context.CreateDefault(); - var selectedDevice = context.Devices[0]; - - foreach (var d in context.Devices) - { - Console.WriteLine("Detected {0} accelerator.", d.ToString()); - - if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL) - selectedDevice = d; - if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda) - selectedDevice = d; - } - - accelerator = selectedDevice.CreateAccelerator(context); - //accelerator = context.Devices[0].CreateAccelerator(context); - - Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); - initialized = true; - block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256; - } - - /// - /// Synchronize pending operations. - /// - private void Synchronize() => accelerator.Synchronize(); - - /// - /// Get the kernel associated with this lambda. - /// - /// The base type of the first argument. Must be an unmanaged type. - /// The action provided on the CPU. - /// The calling location. - /// The GPU kernel. - private Action> GetKernel(Action> action, string src) - where T : unmanaged - { - if (!kernels.ContainsKey(src)) - kernels.Add(src, accelerator.LoadStreamKernel(action)); - - return (Action>)kernels[src]; - } - - /// - /// Get the kernel associated with this lambda. - /// - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The action provided on the CPU. - /// The calling location. - /// The GPU kernel. - private Action, GPUArray> GetKernel(Action, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - { - if (!kernels.ContainsKey(src)) - kernels.Add(src, accelerator.LoadStreamKernel(action)); - - return (Action, GPUArray>)kernels[src]; - } - - /// - /// Get the kernel associated with this lambda. 
- /// - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The action provided on the CPU. - /// The calling location. - /// The GPU kernel. - private Action, GPUArray, GPUArray> GetKernel(Action, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - { - if (!kernels.ContainsKey(src)) - kernels.Add(src, accelerator.LoadStreamKernel(action)); - - return (Action, GPUArray, GPUArray>)kernels[src]; - } - - /// - /// Get the kernel associated with this lambda. - /// - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The action provided on the CPU. - /// The calling location. - /// The GPU kernel. - private Action, GPUArray, GPUArray, GPUArray> GetKernel(Action, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - { - if (!kernels.ContainsKey(src)) - kernels.Add(src, accelerator.LoadStreamKernel(action)); - - return (Action, GPUArray, GPUArray, GPUArray>)kernels[src]; - } - - /// - /// Dispatches a kernel with one parameter. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf, Action> action, string src) - where T : unmanaged - { - var idx = new Index(start); - - var kernel = GetKernel(action, src); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with two parameters. - /// - /// The starts and ends of the loop. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - internal void DispatchKernel((int, int)[] ranges, Buffer buf1, Buffer buf2, Action, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - { - int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y); - var idx = new Index(ranges); - - var kernel = GetKernel(action, src); - - kernel((len / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with three parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Action, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - { - var idx = new Index(start); - - var kernel = GetKernel(action, src); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with four parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Action, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - { - var idx = new Index(start); - - var kernel = GetKernel(action, src); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with five parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. 
- /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Action, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with six parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with seven parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with eight parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with nine parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. 
- /// The base type of the ninth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with ten parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. 
Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with eleven parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. 
- /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10), - new GPUArray(buf11)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with twelve parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. 
- /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. - /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. 
- internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10), - new GPUArray(buf11), - new GPUArray(buf12)); - - Synchronize(); - } - - /// - /// Dispatches a kernel with thirteen parameters. - /// - /// The start of the loop, inclusive. - /// The end of the loop, exclusive. - /// The first buffer to run the kernel with. - /// The second buffer to run the kernel with. - /// The third buffer to run the kernel with. - /// The fourth buffer to run the kernel with. - /// The fifth buffer to run the kernel with. - /// The sixth buffer to run the kernel with. - /// The seventh buffer to run the kernel with. - /// The eighth buffer to run the kernel with. - /// The ninth buffer to run the kernel with. - /// The tenth buffer to run the kernel with. - /// The eleventh buffer to run the kernel with. - /// The twelfth buffer to run the kernel with. - /// The thirteenth buffer to run the kernel with. - /// The kernel to run on the GPU. - /// The originating caller location. - /// The base type of the first argument. Must be an unmanaged type. 
- /// The base type of the second argument. Must be an unmanaged type. - /// The base type of the third argument. Must be an unmanaged type. - /// The base type of the fourth argument. Must be an unmanaged type. - /// The base type of the fifth argument. Must be an unmanaged type. - /// The base type of the sixth argument. Must be an unmanaged type. - /// The base type of the seventh argument. Must be an unmanaged type. - /// The base type of the eighth argument. Must be an unmanaged type. - /// The base type of the ninth argument. Must be an unmanaged type. - /// The base type of the tenth argument. Must be an unmanaged type. - /// The base type of the eleventh argument. Must be an unmanaged type. - /// The base type of the twelfth argument. Must be an unmanaged type. - /// The base type of the thirteenth argument. Must be an unmanaged type. - internal void DispatchKernel(int start, int end, Buffer buf1, Buffer buf2, Buffer buf3, Buffer buf4, Buffer buf5, Buffer buf6, Buffer buf7, Buffer buf8, Buffer buf9, Buffer buf10, Buffer buf11, Buffer buf12, Buffer buf13, Action, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray, GPUArray> action, string src) - where T : unmanaged - where U : unmanaged - where V : unmanaged - where W : unmanaged - where X : unmanaged - where Y : unmanaged - where Z : unmanaged - where A : unmanaged - where B : unmanaged - where C : unmanaged - where D : unmanaged - where E : unmanaged - where F : unmanaged - { - var idx = new Index(start); - - var kernel = accelerator.LoadStreamKernel(action); - - kernel(((end - start) / block_size, block_size), idx, - new GPUArray(buf1), - new GPUArray(buf2), - new GPUArray(buf3), - new GPUArray(buf4), - new GPUArray(buf5), - new GPUArray(buf6), - new GPUArray(buf7), - new GPUArray(buf8), - new GPUArray(buf9), - new GPUArray(buf10), - new GPUArray(buf11), - new GPUArray(buf12), - new GPUArray(buf13)); - - Synchronize(); - } - } -} \ No newline at 
end of file diff --git a/DotMP/GPU/Python/dispatch_gen.py b/DotMP/GPU/Python/dispatch_gen.py deleted file mode 100644 index bb4152cd..00000000 --- a/DotMP/GPU/Python/dispatch_gen.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. -* Copyright (C) 2023 Phillip Allen Lane -* -* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser -* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or -* (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the -* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -* License for more details. -* -* You should have received a copy of the GNU Lesser General Public License along with this library; if not, -* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" - -ofile = open("./dispatch_dump.cs", "w") - -cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight", - "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"] -ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", - "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"] - -letters = ["T", "U", "V", "W", "X", "Y", "Z", - "A", "B", "C", "D", "E", "F", "G", "H", "I"] - -for i in range(0, 13): - funcstr = "" - - funcstr += """/// -/// Dispatches a kernel with {c} parameters. -/// -/// The start of the loop, inclusive. 
-/// The end of the loop, exclusive.""".format(c=cardinals[i]) - - for j in range(i + 1): - adjusted = j + 1 - - funcstr += """ -/// The {o} buffer to run the kernel with.""".format(a=j + 1, o=ordinals[j]) - - funcstr += """ -/// The kernel to run on the GPU.""" - - for j in range(i + 1): - funcstr += """ -/// The base type of the {o} argument. Must be an unmanaged type.""".format(l=letters[j], o=ordinals[j]) - - funcstr += """ -internal void DispatchKernel<""" - - for j in range(i): - funcstr += "{l}, ".format(l=letters[j]) - - funcstr += "{l}>(int start, int end, ".format(l=letters[i]) - - for j in range(i + 1): - adjusted = j + 1 - funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=adjusted) - - funcstr += "Action(buf{a}.View), -""".format(l=letters[j], a=adjusted) - - funcstr += """ new GPUArray<{l}>(buf{a}.View)); - - Synchronize(); -""".format(l=letters[i], a=i + 1) - - funcstr += "}\n\n" - - ofile.write(funcstr) diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py deleted file mode 100644 index e960e861..00000000 --- a/DotMP/GPU/Python/parfor_gen.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. -* Copyright (C) 2023 Phillip Allen Lane -* -* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser -* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or -* (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the -* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -* License for more details. 
-* -* You should have received a copy of the GNU Lesser General Public License along with this library; if not, -* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -""" - -ofile = open("./parfor_dump.cs", "w") - -cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight", - "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"] -ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", - "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"] - -letters = ["T", "U", "V", "W", "X", "Y", "Z", - "A", "B", "C", "D", "E", "F", "G", "H", "I"] - -for i in range(0, 13): - funcstr = "" - - funcstr += """/// -/// Creates a GPU parallel for loop. -/// The body of the kernel is run on a GPU target. -/// This overload specifies that {c} arrays are used on the GPU. -/// -/// The start of the loop, inclusive. -/// The end of the loop, exclusive.""".format(c=cardinals[i]) - - for j in range(i + 1): - adjusted = j + 1 - - funcstr += """ -/// The {o} buffer to run the kernel with.""".format(a=j + 1, o=ordinals[j]) - - funcstr += """ -/// The kernel to run on the GPU.""" - - for j in range(i + 1): - funcstr += """ -/// The base type of the {o} argument. 
Must be an unmanaged type.""".format(l=letters[j], o=ordinals[j]) - - funcstr += """ -public static void ParallelFor<""" - - for j in range(i): - funcstr += "{l}, ".format(l=letters[j]) - - funcstr += "{l}>(int start, int end, ".format(l=letters[i]) - - for j in range(i + 1): - adjusted = j + 1 - funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=adjusted) - - funcstr += "Action Date: Sat, 11 Nov 2023 17:37:34 -0600 Subject: [PATCH 37/61] more autogen --- DotMP/GPU/AcceleratorHandler.tt | 607 ++------------------------------ 1 file changed, 22 insertions(+), 585 deletions(-) diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt index 9542d5cf..96cb8336 100644 --- a/DotMP/GPU/AcceleratorHandler.tt +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -87,13 +87,13 @@ namespace DotMP.GPU ///