From 61b93af05eb3b136d05ed197e6808b26106361ee Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 04:18:46 -0600
Subject: [PATCH 01/61] test using new GPU data transfer API

---
 DotMP-Tests/GPUTests.cs | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)
diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs
index e97422b5..c155fd2d 100644
--- a/DotMP-Tests/GPUTests.cs
+++ b/DotMP-Tests/GPUTests.cs
@@ -33,13 +33,18 @@ public void GPU_for_works()
             random_init(x);
             random_init(y);
 
-            DotMP.GPU.Parallel.DataTo(a, x, y);
-            DotMP.GPU.Parallel.DataFrom(res);
-            DotMP.GPU.Parallel.ParallelFor<double, double, double, float>
-                (0, a.Length, (i, a, x, y, res) =>
             {
-                res[i] = (float)(a[i] * x[i] + y[i]);
-            });
+                using var a_gpu = new DotMP.GPU.Buffer<double>(a, DotMP.GPU.Buffer.Behavior.To);
+                using var x_gpu = new DotMP.GPU.Buffer<double>(x, DotMP.GPU.Buffer.Behavior.To);
+                using var y_gpu = new DotMP.GPU.Buffer<double>(y, DotMP.GPU.Buffer.Behavior.To);
+                using var res_gpu = new DotMP.GPU.Buffer<float>(res, DotMP.GPU.Buffer.Behavior.From);
+
+                DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, x_gpu, y_gpu, res_gpu,
+                    (i, a, x, y, res) =>
+                {
+                    res[i] = (float)(a[i] * x[i] + y[i]);
+                });
+            }
 
             for (int i = 0; i < a.Length; i++)
             {
@@ -50,11 +55,13 @@ public void GPU_for_works()
 
             double[] a_old = a.Select(a => a).ToArray();
 
-            DotMP.GPU.Parallel.DataToFrom(a);
-            DotMP.GPU.Parallel.ParallelFor<double>(0, a.Length, (i, a) =>
+            using (var a_gpu = new DotMP.GPU.Buffer<double>(a, DotMP.GPU.Buffer.Behavior.ToFrom))
             {
-                a[i]++;
-            });
+                DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, (i, a) =>
+                {
+                    a[i]++;
+                });
+            }
 
             for (int i = 0; i < a.Length; i++)
             {

From 60a1c9fd35a80c58ef4e2fb9c696644cf1cc09a3 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 04:18:57 -0600
Subject: [PATCH 02/61] implement new memory model

---
 DotMP/GPU/AcceleratorHandler.cs | 201 +++++---------------------------
 DotMP/GPU/Buffer.cs             |  95 +++++++++++++++
 DotMP/GPU/Gpu.cs                |  65 +++--------
 DotMP/GPU/GpuArray.cs           |   4 +-
 4 files changed, 144 insertions(+), 221 deletions(-)
 create mode 100644 DotMP/GPU/Buffer.cs

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index e6f2771f..f2889a00 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -21,31 +21,10 @@ internal class AcceleratorHandler
         /// <summary>
         /// The accelerator object.
         /// </summary>
-        private static Accelerator accelerator;
+        internal static Accelerator accelerator;
         /// <summary>
-        /// The GPU pointers for arrays going to the GPU.
+        /// 
         /// </summary>
-        private static dynamic[] tos;
-        /// <summary>
-        /// The GPU pointers for arrays coming back from the GPU.
-        /// </summary>
-        private static dynamic[] froms;
-        /// <summary>
-        /// The CPU pointers for arrays coming back from the GPU.
-        /// </summary>
-        private static dynamic[] froms_cpu;
-        /// <summary>
-        /// The GPU pointers for arrays going both to and from the GPU.
-        /// </summary>
-        private static dynamic[] tofroms;
-        /// <summary>
-        /// The CPU pointers for arrays going both to and from the GPU.
-        /// </summary>
-        private static dynamic[] tofroms_cpu;
-        /// <summary>
-        /// Counts how many arrays have been copied back to the CPU for bookkeeping.
-        /// </summary>
-        private static int copied_back;
         private static int block_size;
 
         /// <summary>
@@ -59,129 +38,13 @@ internal AcceleratorHandler()
             accelerator = context.Devices[0].CreateAccelerator(context);
             Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString());
             initialized = true;
-            copied_back = 0;
             block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256;
-
-            tos = new dynamic[0];
-            froms = new dynamic[0];
-            tofroms = new dynamic[0];
-            froms_cpu = new dynamic[0][];
-            tofroms_cpu = new dynamic[0][];
-        }
-
-        /// <summary>
-        /// Aggregates the parameters into a single array.
-        /// </summary>
-        /// <returns>A dynamic array of all parameters.</returns>
-        private dynamic[] AggregateParams(int count)
-        {
-            dynamic[] ret = tos.Concat(froms).Concat(tofroms).ToArray();
-
-            if (ret.Length != count)
-                throw new WrongNumberOfDataMovementsSpecifiedException(string.Format("Specified {0} data movement(s), expected {1}.", ret.Length, count));
-
-            return ret;
-        }
-
-        /// <summary>
-        /// Called if data should be moved to the device.
-        /// Allocates data on GPU and copies the data from the CPU.
-        /// </summary>
-        /// <typeparam name="T">The type of data to allocate.</typeparam>
-        /// <param name="values">The data to allocate.</param>
-        internal void AllocateTo<T>(T[][] values)
-            where T : unmanaged
-        {
-            if (froms.Length > 0 || tofroms.Length > 0)
-                throw new ImproperDataMovementOrderingException("DataTo should be called before DataFrom and DataToFrom.");
-
-            var tos = values.Select(v => accelerator.Allocate1D(v)).ToArray();
-            AcceleratorHandler.tos = AcceleratorHandler.tos.Concat(tos).ToArray();
-
-            for (int i = 0; i < tos.Length; i++)
-                tos[i].CopyFromCPU(values[i]);
-        }
-
-        /// <summary>
-        /// Called if data should be moved from the device.
-        /// Allocates data on GPU.
-        /// </summary>
-        /// <typeparam name="T">The type of data to allocate.</typeparam>
-        /// <param name="values">The data to allocate.</param>
-        internal void AllocateFrom<T>(T[][] values)
-            where T : unmanaged
-        {
-            if (tofroms.Length > 0)
-                throw new ImproperDataMovementOrderingException("DataFrom should be called before DataToFrom.");
-
-            var froms = values.Select(v => accelerator.Allocate1D<T>(v.Length)).ToArray();
-            AcceleratorHandler.froms = AcceleratorHandler.froms.Concat(froms).ToArray();
-            AcceleratorHandler.froms_cpu = AcceleratorHandler.froms_cpu.Concat(values).ToArray();
-        }
-
-        /// <summary>
-        /// Called if data should be moved to and from the device.
-        /// Allocates data on GPU and copies the data from the CPU.
-        /// </summary>
-        /// <typeparam name="T">The type of data to allocate.</typeparam>
-        /// <param name="values">The data to allocate.</param>
-        internal void AllocateToFrom<T>(T[][] values)
-            where T : unmanaged
-        {
-            var tofroms = values.Select(v => accelerator.Allocate1D(v)).ToArray();
-            AcceleratorHandler.tofroms = AcceleratorHandler.tofroms.Concat(tofroms).ToArray();
-            AcceleratorHandler.tofroms_cpu = AcceleratorHandler.tofroms_cpu.Concat(values).ToArray();
-
-            for (int i = 0; i < tos.Length; i++)
-                tofroms[i].CopyFromCPU(values[i]);
-        }
-
-        /// <summary>
-        /// Synchronizes the GPU stream.
-        /// </summary>
-        internal void Synchronize() =>
-            accelerator.DefaultStream.Synchronize();
-
-        /// <summary>
-        /// Copies a piece of GPU memory back to the CPU.
-        /// </summary>
-        /// <typeparam name="T">The type of the data to transfer.</typeparam>
-        /// <param name="item">A MemoryBuffer1D object to transfer.</param>
-        internal void CopyBack<T>(dynamic item)
-            where T : unmanaged
-        {
-            MemoryBuffer1D<T, Stride1D.Dense> castedItem = item;
-
-            if (copied_back >= tos.Length && copied_back - tos.Length < froms.Length)
-                castedItem.GetAsArray1D().CopyTo(froms_cpu[copied_back - tos.Length], 0);
-            else if (copied_back >= tos.Length)
-                castedItem.GetAsArray1D().CopyTo(tofroms_cpu[copied_back - tos.Length - froms.Length], 0);
-
-            copied_back++;
         }
 
         /// <summary>
-        /// Called to finalize kernel execution.
-        /// Clears all of the arrays used in the kernel.
+        /// Synchronize pending operations.
         /// </summary>
-        internal void FinalizeKernel()
-        {
-            foreach (var i in tos)
-                i.Dispose();
-            tos = new dynamic[0];
-
-            foreach (var i in froms)
-                i.Dispose();
-            froms = new dynamic[0];
-            froms_cpu = new dynamic[0][];
-
-            foreach (var i in tofroms)
-                i.Dispose();
-            tofroms = new dynamic[0];
-            tofroms_cpu = new dynamic[0][];
-
-            copied_back = 0;
-        }
+        private void Synchronize() => accelerator.Synchronize();
 
         /// <summary>
         /// Dispatches a kernel with one data parameter.
@@ -189,21 +52,19 @@ internal void FinalizeKernel()
         /// <typeparam name="T">The type of the data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf">The buffer to run the kernel with.</param>
         /// <param name="action">The action to perform.</param>
-        internal void DispatchKernel<T>(int start, int end, Action<Index, GPUArray<T>> action)
+        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action)
             where T : unmanaged
         {
-            dynamic[] parameters = AggregateParams(1);
             var idx = new Index();
 
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(parameters[0].View));
+                new GPUArray<T>(buf.View));
 
             Synchronize();
-            CopyBack<T>(parameters[0]);
-            FinalizeKernel();
         }
 
         /// <summary>
@@ -213,24 +74,22 @@ internal void DispatchKernel<T>(int start, int end, Action<Index, GPUArray<T>> a
         /// <typeparam name="U">The type of the second data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="action">The action to perform.</param>
-        internal void DispatchKernel<T, U>(int start, int end, Action<Index, GPUArray<T>, GPUArray<U>> action)
+        internal void DispatchKernel<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action)
             where T : unmanaged
             where U : unmanaged
         {
-            dynamic[] parameters = AggregateParams(2);
             var idx = new Index();
 
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(parameters[0].View),
-                new GPUArray<U>(parameters[1].View));
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View));
 
             Synchronize();
-            CopyBack<T>(parameters[0]);
-            CopyBack<U>(parameters[1]);
-            FinalizeKernel();
         }
 
         /// <summary>
@@ -241,27 +100,25 @@ internal void DispatchKernel<T, U>(int start, int end, Action<Index, GPUArray<T>
         /// <typeparam name="V">The type of the third data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="action">The action to perform.</param>
-        internal void DispatchKernel<T, U, V>(int start, int end, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action)
+        internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
         {
-            dynamic[] parameters = AggregateParams(3);
             var idx = new Index();
 
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(parameters[0].View),
-                new GPUArray<U>(parameters[1].View),
-                new GPUArray<V>(parameters[2].View));
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View));
 
             Synchronize();
-            CopyBack<T>(parameters[0]);
-            CopyBack<U>(parameters[1]);
-            CopyBack<V>(parameters[2]);
-            FinalizeKernel();
         }
 
         /// <summary>
@@ -273,30 +130,28 @@ internal void DispatchKernel<T, U, V>(int start, int end, Action<Index, GPUArray
         /// <typeparam name="W">The type of the fourth data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="action">The action to perform.</param>
-        internal void DispatchKernel<T, U, V, W>(int start, int end, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action)
+        internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
             where W : unmanaged
         {
-            dynamic[] parameters = AggregateParams(4);
             var idx = new Index();
 
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(parameters[0].View),
-                new GPUArray<U>(parameters[1].View),
-                new GPUArray<V>(parameters[2].View),
-                new GPUArray<W>(parameters[3].View));
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View));
 
             Synchronize();
-            CopyBack<T>(parameters[0]);
-            CopyBack<U>(parameters[1]);
-            CopyBack<V>(parameters[2]);
-            CopyBack<W>(parameters[3]);
-            FinalizeKernel();
         }
     }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs
new file mode 100644
index 00000000..6419f130
--- /dev/null
+++ b/DotMP/GPU/Buffer.cs
@@ -0,0 +1,95 @@
+using System;
+using DotMP.GPU;
+using ILGPU.Runtime;
+
+namespace DotMP.GPU
+{
+    namespace Buffer
+    {
+        /// <summary>
+        /// Specifies the behavior of the buffer.
+        /// </summary>
+        public enum Behavior
+        {
+            /// <summary>
+            /// Specifies that data should be transferred to the GPU, but not from it.
+            /// </summary>
+            To,
+            /// <summary>
+            /// Specifies that data should be transferred from the GPU, but not to it.
+            /// </summary>
+            From,
+            /// <summary>
+            /// Specifies that data should be transferred both to and from the GPU.
+            /// </summary>
+            ToFrom
+        }
+    }
+
+    /// <summary>
+    /// Buffer to manage GPU memory. Should only be created on the CPU.
+    /// </summary>
+    public class Buffer<T> : IDisposable
+        where T : unmanaged
+    {
+
+        /// <summary>
+        /// The ILGPU buffer.
+        /// </summary>
+        private MemoryBuffer1D<T, ILGPU.Stride1D.Dense> buf;
+
+        /// <summary>
+        /// Behavior of the data, as specified by Behavior.
+        /// </summary>
+        private Buffer.Behavior behavior;
+
+        /// <summary>
+        /// The CPU array, so that we can copy the data back.
+        /// </summary>
+        private T[] data;
+
+        /// <summary>
+        /// Constructor for buffer object. Allocates data on the GPU and makes it available for the next GPU kernel.
+        /// </summary>
+        /// <param name="data">The data to allocate on the GPU.</param>
+        /// <param name="behavior">The behavior of the data, see Behavior.</param>
+        public Buffer(T[] data, Buffer.Behavior behavior)
+        {
+            new AcceleratorHandler();
+
+            this.behavior = behavior;
+            this.data = data;
+
+            switch (behavior)
+            {
+                case Buffer.Behavior.To:
+                    buf = AcceleratorHandler.accelerator.Allocate1D(data);
+                    break;
+                case Buffer.Behavior.From:
+                    buf = AcceleratorHandler.accelerator.Allocate1D<T>(data.Length);
+                    break;
+                case Buffer.Behavior.ToFrom:
+                    buf = AcceleratorHandler.accelerator.Allocate1D(data);
+                    break;
+            }
+        }
+
+        /// <summary>
+        /// Dispose of the buffer, freeing GPU memory and copying any relevant data back to the CPU.
+        /// </summary>
+        public void Dispose()
+        {
+            if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom)
+            {
+                buf.GetAsArray1D().CopyTo(data, 0);
+            }
+
+            buf.Dispose();
+        }
+
+        /// <summary>
+        /// Get the view of the memory for the GPU.
+        /// </summary>
+        internal ArrayView1D<T, ILGPU.Stride1D.Dense> View { get => buf.View; }
+    }
+}
\ No newline at end of file
diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
index c541462a..63c402d5 100644
--- a/DotMP/GPU/Gpu.cs
+++ b/DotMP/GPU/Gpu.cs
@@ -17,13 +17,14 @@ public static class Parallel
         /// </summary>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf">The buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T>(int start, int end, Action<Index, GPUArray<T>> action)
+        public static void ParallelFor<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action)
             where T : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, action);
+            handler.DispatchKernel(start, end, buf, action);
         }
 
         /// <summary>
@@ -33,15 +34,17 @@ public static void ParallelFor<T>(int start, int end, Action<Index, GPUArray<T>>
         /// </summary>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U>(int start, int end, Action<Index, GPUArray<T>, GPUArray<U>> action)
+        public static void ParallelFor<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action)
             where T : unmanaged
             where U : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, action);
+            handler.DispatchKernel(start, end, buf1, buf2, action);
         }
 
         /// <summary>
@@ -51,17 +54,20 @@ public static void ParallelFor<T, U>(int start, int end, Action<Index, GPUArray<
         /// </summary>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V>(int start, int end, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action)
+        public static void ParallelFor<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, action);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, action);
         }
 
         /// <summary>
@@ -71,58 +77,23 @@ public static void ParallelFor<T, U, V>(int start, int end, Action<Index, GPUArr
         /// </summary>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W>(int start, int end, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action)
+        public static void ParallelFor<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
             where W : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, action);
-        }
-
-        /// <summary>
-        /// Specifies data movement to the GPU at the start of the kernel, but not back to the CPU at the end of the kernel.
-        /// Can be called multiple times with different datatypes, but is cleared after a call to Kernel().
-        /// </summary>
-        /// <typeparam name="T">The base type of the data. Must be an unmanaged type.</typeparam>
-        /// <param name="to_data">The data to move to the GPU.</param>
-        public static void DataTo<T>(params T[][] to_data)
-            where T : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            handler.AllocateTo(to_data);
-        }
-
-        /// <summary>
-        /// Specifies data movement back to the CPU at the end of the kernel, but not to the GPU at the start of the kernel..
-        /// Can be called multiple times with different datatypes, but is cleared after a call to Kernel().
-        /// </summary>
-        /// <typeparam name="T">The base type of the data. Must be an unmanaged type.</typeparam>
-        /// <param name="to_data">The data to move from the GPU.</param>
-        public static void DataFrom<T>(params T[][] to_data)
-            where T : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            handler.AllocateFrom(to_data);
-        }
-
-        /// <summary>
-        /// Specifies data movement to the GPU at the start of the kernel, and from the GPU back to the CPU at the end of the kernel.
-        /// Can be called multiple times with different datatypes, but is cleared after a call to Kernel().
-        /// </summary>
-        /// <typeparam name="T">The base type of the data. Must be an unmanaged type.</typeparam>
-        /// <param name="to_data">The data to move to and from the GPU.</param>
-        public static void DataToFrom<T>(params T[][] to_data)
-            where T : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            handler.AllocateToFrom(to_data);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action);
         }
     }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index 0c3fd3e4..d7281740 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -1,4 +1,5 @@
 using ILGPU;
+using System;
 
 namespace DotMP.GPU
 {
@@ -6,7 +7,8 @@ namespace DotMP.GPU
     /// Wrapper object for representing arrays on the GPU.
     /// </summary>
     /// <typeparam name="T"></typeparam>
-    public struct GPUArray<T> where T : unmanaged
+    public struct GPUArray<T>
+        where T : unmanaged
     {
         /// <summary>
         /// Internal ArrayView object.

From 49706b3910d14542fd8a3cebe279b8c7ddb09eb0 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 04:20:26 -0600
Subject: [PATCH 03/61] tidying up duplicate code

---
 DotMP/GPU/Buffer.cs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs
index 6419f130..e0bd6908 100644
--- a/DotMP/GPU/Buffer.cs
+++ b/DotMP/GPU/Buffer.cs
@@ -63,14 +63,12 @@ public Buffer(T[] data, Buffer.Behavior behavior)
             switch (behavior)
             {
                 case Buffer.Behavior.To:
+                case Buffer.Behavior.ToFrom:
                     buf = AcceleratorHandler.accelerator.Allocate1D(data);
                     break;
                 case Buffer.Behavior.From:
                     buf = AcceleratorHandler.accelerator.Allocate1D<T>(data.Length);
                     break;
-                case Buffer.Behavior.ToFrom:
-                    buf = AcceleratorHandler.accelerator.Allocate1D(data);
-                    break;
             }
         }
 

From 68092ff3b67ff586130a1e50842402f58cbae29d Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 04:23:25 -0600
Subject: [PATCH 04/61] remove unnecessary implicit operators

---
 DotMP/GPU/GpuArray.cs | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index d7281740..55c45edb 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -24,24 +24,6 @@ public GPUArray(ArrayView<T> arrayView)
             this.arrayView = arrayView;
         }
 
-        /// <summary>
-        /// Implicit conversion to ArrayView.
-        /// </summary>
-        /// <param name="array">The GPUArray object.</param>
-        public static implicit operator ArrayView<T>(GPUArray<T> array)
-        {
-            return array.arrayView;
-        }
-
-        /// <summary>
-        /// Implicit conversion to GPUArray.
-        /// </summary>
-        /// <param name="array">The ArrayView object.</param>
-        public static implicit operator GPUArray<T>(ArrayView<T> array)
-        {
-            return new GPUArray<T>(array);
-        }
-
         /// <summary>
         /// Overload for [] operator.
         /// </summary>

From 0ac98e5faf92800be778097415e2bbdb7b3ca2e7 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 04:30:13 -0600
Subject: [PATCH 05/61] add overloads for dispatching loops with 5 or 6
 variables

---
 DotMP/GPU/AcceleratorHandler.cs | 80 ++++++++++++++++++++++++++++++++-
 DotMP/GPU/Gpu.cs                | 61 +++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index f2889a00..220d57bd 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -122,7 +122,7 @@ internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer
         }
 
         /// <summary>
-        /// The type of the first data parameter.
+        /// Dispatches a kernel with four parameters.
         /// </summary>
         /// <typeparam name="T">The type of the first data parameter.</typeparam>
         /// <typeparam name="U">The type of the second data parameter.</typeparam>
@@ -153,5 +153,83 @@ internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buf
 
             Synchronize();
         }
+
+        /// <summary>
+        /// Dispatches a kernel with five parameters: the action is loaded as a stream kernel, launched over the views of the five buffers, and followed by a call to Synchronize.
+        /// </summary>
+        /// <typeparam name="T">The type of the first data parameter.</typeparam>
+        /// <typeparam name="U">The type of the second data parameter.</typeparam>
+        /// <typeparam name="V">The type of the third data parameter.</typeparam>
+        /// <typeparam name="W">The type of the fourth data parameter.</typeparam>
+        /// <typeparam name="X">The type of the fifth data parameter.</typeparam>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="action">The action to perform.</param>
+        internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+        {
+            var idx = new Index();
+            // The delegate returned by LoadStreamKernel is invoked below with a two-element tuple, the index, and one GPUArray per buffer view.
+            var kernel = accelerator.LoadStreamKernel(action);
+            // NOTE(review): start is used only in (end - start), and an integer division by block_size would truncate any remainder -- confirm both are handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View));
+            // Synchronize() presumably waits for the launched kernel to complete -- confirm against its definition.
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Dispatches a kernel with six parameters: the action is loaded as a stream kernel, launched over the views of the six buffers, and followed by a call to Synchronize.
+        /// </summary>
+        /// <typeparam name="T">The type of the first data parameter.</typeparam>
+        /// <typeparam name="U">The type of the second data parameter.</typeparam>
+        /// <typeparam name="V">The type of the third data parameter.</typeparam>
+        /// <typeparam name="W">The type of the fourth data parameter.</typeparam>
+        /// <typeparam name="X">The type of the fifth data parameter.</typeparam>
+        /// <typeparam name="Y">The type of the sixth data parameter.</typeparam>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="action">The action to perform.</param>
+        internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+        {
+            var idx = new Index();
+            // The delegate returned by LoadStreamKernel is invoked below with a two-element tuple, the index, and one GPUArray per buffer view.
+            var kernel = accelerator.LoadStreamKernel(action);
+            // NOTE(review): start is used only in (end - start), and an integer division by block_size would truncate any remainder -- confirm both are handled elsewhere.
+            kernel(((end - start) / block_size, block_size), idx,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View));
+            // Synchronize() presumably waits for the launched kernel to complete -- confirm against its definition.
+            Synchronize();
+        }
     }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
index 63c402d5..6175bf09 100644
--- a/DotMP/GPU/Gpu.cs
+++ b/DotMP/GPU/Gpu.cs
@@ -95,5 +95,66 @@ public static void ParallelFor<T, U, V, W>(int start, int end, Buffer<T> buf1, B
             var handler = new AcceleratorHandler();
             handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action);
         }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when five arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action);
+        }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when six arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf6">Sixth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Y">Type argument of the sixth buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action);
+        }
     }
 }
\ No newline at end of file

From b76af519cd4b79f09fa5f19bf0e5915ce7caff94 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 05:11:06 -0600
Subject: [PATCH 06/61] add parfor_dump to gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 25fb8c99..a24fbd40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ docs/*
 .vscode
 *.opencover.xml
 *.sln
+parfor_dump.cs
 ProcessedREADME.md
 
 # User-specific files

From c2f591a0769f6fc5f6f294259d6566a0d269323e Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 05:11:18 -0600
Subject: [PATCH 07/61] add parallel for overload code gen

---
 DotMP/GPU/parfor_gen.py | 72 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 DotMP/GPU/parfor_gen.py

diff --git a/DotMP/GPU/parfor_gen.py b/DotMP/GPU/parfor_gen.py
new file mode 100644
index 00000000..f001dc9b
--- /dev/null
+++ b/DotMP/GPU/parfor_gen.py
@@ -0,0 +1,72 @@
+# Writes C# ParallelFor overloads for one to sixteen buffers into ./parfor_dump.cs.
+# NOTE(review): the sixteenth overload needs Action with 17 type arguments; System.Action stops at 16 -- confirm it compiles.
+
+cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight",
+             "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
+ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
+            "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"]
+
+letters = ["T", "U", "V", "W", "X", "Y", "Z",
+           "A", "B", "C", "D", "E", "F", "G", "H", "I"]
+
+overloads = []
+
+for i in range(0, 16):
+    funcstr = """/// <summary>
+/// Creates a GPU parallel for loop.
+/// The body of the kernel is run on a GPU target.
+/// This overload specifies that {c} arrays are used on the GPU.
+/// </summary>
+/// <param name="start">The start of the loop, inclusive.</param>
+/// <param name="end">The end of the loop, exclusive.</param>""".format(c=cardinals[i])
+
+    for j in range(i + 1):
+        funcstr += """
+/// <param name="buf{a}">The {o} buffer to run the kernel with.</param>""".format(a=j + 1, o=ordinals[j])
+
+    funcstr += """
+/// <param name="action">The kernel to run on the GPU.</param>"""
+
+    for j in range(i + 1):
+        funcstr += """
+/// <typeparam name="{l}">The base type of the {o} argument. Must be an unmanaged type.</typeparam>""".format(l=letters[j], o=ordinals[j])
+    # Signature: generic parameter list, buffer parameters, then the Action delegate type.
+    funcstr += """
+public static void ParallelFor<"""
+
+    for j in range(i):
+        funcstr += "{l}, ".format(l=letters[j])
+
+    funcstr += "{l}>(int start, int end, ".format(l=letters[i])
+
+    for j in range(i + 1):
+        funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=j + 1)
+
+    funcstr += "Action<Index, "
+
+    for j in range(i):
+        funcstr += "GPUArray<{l}>, ".format(l=letters[j])
+
+    funcstr += "GPUArray<{l}>> action)".format(l=letters[i])
+
+    for j in range(i + 1):
+        funcstr += "\n    where {l} : unmanaged".format(l=letters[j])
+    # Body: forward every buffer and the action to AcceleratorHandler.DispatchKernel.
+    funcstr += """
+{
+    var handler = new AcceleratorHandler();
+    handler.DispatchKernel(start, end, """
+
+    for j in range(i + 1):
+        funcstr += "buf{a}, ".format(a=j + 1)
+
+    funcstr += """action);
+}
+
+"""
+
+    overloads.append(funcstr)
+
+# Open the output only after all text is built; the with-block closes the file on exit.
+with open("./parfor_dump.cs", "w") as ofile:
+    ofile.write("".join(overloads))

From dd5640701440b24efa2c154e6816958c2201ef1d Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 05:11:37 -0600
Subject: [PATCH 08/61] add overloads for up to 16 kernel variables

---
 DotMP/GPU/Gpu.cs | 485 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 485 insertions(+)

diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
index 6175bf09..624c6033 100644
--- a/DotMP/GPU/Gpu.cs
+++ b/DotMP/GPU/Gpu.cs
@@ -156,5 +156,490 @@ public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> b
             var handler = new AcceleratorHandler();
             handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action);
         }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when seven arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf6">Sixth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf7">Seventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Y">Type argument of the sixth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Z">Type argument of the seventh buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, action);
+        }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when eight arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf6">Sixth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf7">Seventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf8">Eighth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Y">Type argument of the sixth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Z">Type argument of the seventh buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="A">Type argument of the eighth buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action);
+        }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when nine arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf6">Sixth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf7">Seventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf8">Eighth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf9">Ninth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Y">Type argument of the sixth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Z">Type argument of the seventh buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="A">Type argument of the eighth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="B">Type argument of the ninth buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action);
+        }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when ten arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf6">Sixth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf7">Seventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf8">Eighth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf9">Ninth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf10">Tenth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Y">Type argument of the sixth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Z">Type argument of the seventh buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="A">Type argument of the eighth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="B">Type argument of the ninth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="C">Type argument of the tenth buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action);
+        }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when eleven arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf6">Sixth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf7">Seventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf8">Eighth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf9">Ninth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf10">Tenth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf11">Eleventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Y">Type argument of the sixth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Z">Type argument of the seventh buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="A">Type argument of the eighth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="B">Type argument of the ninth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="C">Type argument of the tenth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="D">Type argument of the eleventh buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action);
+        }
+
+        /// <summary>
+        /// GPU counterpart of a parallel for loop.
+        /// The kernel body executes on a GPU target.
+        /// Choose this overload when twelve arrays are needed on the GPU.
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop.</param>
+        /// <param name="end">Exclusive upper bound of the loop.</param>
+        /// <param name="buf1">First buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf2">Second buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf3">Third buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf4">Fourth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf5">Fifth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf6">Sixth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf7">Seventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf8">Eighth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf9">Ninth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf10">Tenth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf11">Eleventh buffer forwarded to the kernel dispatch.</param>
+        /// <param name="buf12">Twelfth buffer forwarded to the kernel dispatch.</param>
+        /// <param name="action">Kernel body to run on the GPU.</param>
+        /// <typeparam name="T">Type argument of the first buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="U">Type argument of the second buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="V">Type argument of the third buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="W">Type argument of the fourth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="X">Type argument of the fifth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Y">Type argument of the sixth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="Z">Type argument of the seventh buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="A">Type argument of the eighth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="B">Type argument of the ninth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="C">Type argument of the tenth buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="D">Type argument of the eleventh buffer; constrained to unmanaged types.</typeparam>
+        /// <typeparam name="E">Type argument of the twelfth buffer; constrained to unmanaged types.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+        {
+            // The handler is created for this call only and used immediately.
+            new AcceleratorHandler().DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action);
+        }
+
+        /// <summary>
+        /// Creates a GPU parallel for loop over the range from start (inclusive) to end (exclusive).
+        /// The body of the kernel is run on a GPU target; the call is forwarded to DispatchKernel on a newly created AcceleratorHandler.
+        /// This overload specifies that thirteen arrays are used on the GPU.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
+        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
+        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU. It takes an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">The base type of <paramref name="buf1"/> and of the kernel's first GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of <paramref name="buf2"/> and of the kernel's second GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of <paramref name="buf3"/> and of the kernel's third GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of <paramref name="buf4"/> and of the kernel's fourth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of <paramref name="buf5"/> and of the kernel's fifth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of <paramref name="buf6"/> and of the kernel's sixth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of <paramref name="buf7"/> and of the kernel's seventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of <paramref name="buf8"/> and of the kernel's eighth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of <paramref name="buf9"/> and of the kernel's ninth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of <paramref name="buf10"/> and of the kernel's tenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="D">The base type of <paramref name="buf11"/> and of the kernel's eleventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="E">The base type of <paramref name="buf12"/> and of the kernel's twelfth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="F">The base type of <paramref name="buf13"/> and of the kernel's thirteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+            where F : unmanaged
+        {
+            var handler = new AcceleratorHandler();
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action);
+        }
+
+        /// <summary>
+        /// Creates a GPU parallel for loop over the range from start (inclusive) to end (exclusive).
+        /// The body of the kernel is run on a GPU target; the call is forwarded to DispatchKernel on a newly created AcceleratorHandler.
+        /// This overload specifies that fourteen arrays are used on the GPU.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
+        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
+        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
+        /// <param name="buf14">The fourteenth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU. It takes an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">The base type of <paramref name="buf1"/> and of the kernel's first GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of <paramref name="buf2"/> and of the kernel's second GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of <paramref name="buf3"/> and of the kernel's third GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of <paramref name="buf4"/> and of the kernel's fourth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of <paramref name="buf5"/> and of the kernel's fifth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of <paramref name="buf6"/> and of the kernel's sixth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of <paramref name="buf7"/> and of the kernel's seventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of <paramref name="buf8"/> and of the kernel's eighth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of <paramref name="buf9"/> and of the kernel's ninth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of <paramref name="buf10"/> and of the kernel's tenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="D">The base type of <paramref name="buf11"/> and of the kernel's eleventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="E">The base type of <paramref name="buf12"/> and of the kernel's twelfth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="F">The base type of <paramref name="buf13"/> and of the kernel's thirteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="G">The base type of <paramref name="buf14"/> and of the kernel's fourteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F, G>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Buffer<G> buf14, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>, GPUArray<G>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+            where F : unmanaged
+            where G : unmanaged
+        {
+            var handler = new AcceleratorHandler();
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, action);
+        }
+
+        /// <summary>
+        /// Creates a GPU parallel for loop over the range from start (inclusive) to end (exclusive).
+        /// The body of the kernel is run on a GPU target; the call is forwarded to DispatchKernel on a newly created AcceleratorHandler.
+        /// This overload specifies that fifteen arrays are used on the GPU.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
+        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
+        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
+        /// <param name="buf14">The fourteenth buffer to run the kernel with.</param>
+        /// <param name="buf15">The fifteenth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU. It takes an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">The base type of <paramref name="buf1"/> and of the kernel's first GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of <paramref name="buf2"/> and of the kernel's second GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of <paramref name="buf3"/> and of the kernel's third GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of <paramref name="buf4"/> and of the kernel's fourth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of <paramref name="buf5"/> and of the kernel's fifth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of <paramref name="buf6"/> and of the kernel's sixth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of <paramref name="buf7"/> and of the kernel's seventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of <paramref name="buf8"/> and of the kernel's eighth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of <paramref name="buf9"/> and of the kernel's ninth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of <paramref name="buf10"/> and of the kernel's tenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="D">The base type of <paramref name="buf11"/> and of the kernel's eleventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="E">The base type of <paramref name="buf12"/> and of the kernel's twelfth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="F">The base type of <paramref name="buf13"/> and of the kernel's thirteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="G">The base type of <paramref name="buf14"/> and of the kernel's fourteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="H">The base type of <paramref name="buf15"/> and of the kernel's fifteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F, G, H>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Buffer<G> buf14, Buffer<H> buf15, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>, GPUArray<G>, GPUArray<H>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+            where F : unmanaged
+            where G : unmanaged
+            where H : unmanaged
+        {
+            var handler = new AcceleratorHandler();
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, action);
+        }
+
+        /// <summary>
+        /// Creates a GPU parallel for loop from start (inclusive) to end (exclusive); the kernel body is run on a GPU target via DispatchKernel on a newly created AcceleratorHandler.
+        /// This overload specifies that sixteen arrays are used on the GPU.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf2">The second buffer to run the kernel with.</param>
+        /// <param name="buf3">The third buffer to run the kernel with.</param>
+        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
+        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
+        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
+        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
+        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
+        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
+        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
+        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
+        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
+        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
+        /// <param name="buf14">The fourteenth buffer to run the kernel with.</param>
+        /// <param name="buf15">The fifteenth buffer to run the kernel with.</param>
+        /// <param name="buf16">The sixteenth buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU. It takes an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">The base type of <paramref name="buf1"/> and of the kernel's first GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of <paramref name="buf2"/> and of the kernel's second GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of <paramref name="buf3"/> and of the kernel's third GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of <paramref name="buf4"/> and of the kernel's fourth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of <paramref name="buf5"/> and of the kernel's fifth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of <paramref name="buf6"/> and of the kernel's sixth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Z">The base type of <paramref name="buf7"/> and of the kernel's seventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="A">The base type of <paramref name="buf8"/> and of the kernel's eighth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="B">The base type of <paramref name="buf9"/> and of the kernel's ninth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="C">The base type of <paramref name="buf10"/> and of the kernel's tenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="D">The base type of <paramref name="buf11"/> and of the kernel's eleventh GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="E">The base type of <paramref name="buf12"/> and of the kernel's twelfth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="F">The base type of <paramref name="buf13"/> and of the kernel's thirteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="G">The base type of <paramref name="buf14"/> and of the kernel's fourteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="H">The base type of <paramref name="buf15"/> and of the kernel's fifteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="I">The base type of <paramref name="buf16"/> and of the kernel's sixteenth GPUArray parameter. Must be an unmanaged type.</typeparam>
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F, G, H, I>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Buffer<G> buf14, Buffer<H> buf15, Buffer<I> buf16, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>, GPUArray<G>, GPUArray<H>, GPUArray<I>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+            where F : unmanaged
+            where G : unmanaged
+            where H : unmanaged
+            where I : unmanaged
+        {
+            // NOTE(review): the Action above has 17 type arguments (Index + 16 GPUArrays); System.Action is only defined up to 16 — confirm a matching delegate exists in the project.
+            var handler = new AcceleratorHandler();
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, action);
+        }
     }
 }
\ No newline at end of file

From 82165daa00fff2e4af60e21f243904531cbf58ce Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 05:12:47 -0600
Subject: [PATCH 09/61] move python to python folder

---
 .../{parfor_gen.py => Python/dispatch_gen.py} |  0
 DotMP/GPU/Python/parfor_gen.py                | 72 +++++++++++++++++++
 2 files changed, 72 insertions(+)
 rename DotMP/GPU/{parfor_gen.py => Python/dispatch_gen.py} (100%)
 create mode 100644 DotMP/GPU/Python/parfor_gen.py

diff --git a/DotMP/GPU/parfor_gen.py b/DotMP/GPU/Python/dispatch_gen.py
similarity index 100%
rename from DotMP/GPU/parfor_gen.py
rename to DotMP/GPU/Python/dispatch_gen.py
diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py
new file mode 100644
index 00000000..f001dc9b
--- /dev/null
+++ b/DotMP/GPU/Python/parfor_gen.py
@@ -0,0 +1,72 @@
+ofile = open("./parfor_dump.cs", "w")
+
+cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight",
+             "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
+ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
+            "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"]
+
+letters = ["T", "U", "V", "W", "X", "Y", "Z",
+           "A", "B", "C", "D", "E", "F", "G", "H", "I"]
+
+for i in range(0, 16):
+    funcstr = ""
+
+    funcstr += """/// <summary>
+/// Creates a GPU parallel for loop.
+/// The body of the kernel is run on a GPU target.
+/// This overload specifies that {c} arrays are used on the GPU.
+/// </summary>
+/// <param name="start">The start of the loop, inclusive.</param>
+/// <param name="end">The end of the loop, exclusive.</param>""".format(c=cardinals[i])
+
+    for j in range(i + 1):
+        adjusted = j + 1
+
+        funcstr += """
+/// <param name="buf{a}">The {o} buffer to run the kernel with.</param>""".format(a=j + 1, o=ordinals[j])
+
+    funcstr += """
+/// <param name="action">The kernel to run on the GPU.</param>"""
+
+    for j in range(i + 1):
+        funcstr += """
+/// <typeparam name="{l}">The base type of the {o} argument. Must be an unmanaged type.</typeparam>""".format(l=letters[j], o=ordinals[j])
+
+    funcstr += """
+public static void ParallelFor<"""
+
+    for j in range(i):
+        funcstr += "{l}, ".format(l=letters[j])
+
+    funcstr += "{l}>(int start, int end, ".format(l=letters[i])
+
+    for j in range(i + 1):
+        adjusted = j + 1
+        funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=adjusted)
+
+    funcstr += "Action<Index, "
+
+    for j in range(i):
+        adjusted = j + 1
+        funcstr += "GPUArray<{l}>, ".format(l=letters[j])
+
+    funcstr += "GPUArray<{l}>> action)".format(l=letters[i])
+
+    for j in range(i + 1):
+        funcstr += "\n    where {l} : unmanaged".format(l=letters[j])
+
+    funcstr += """
+{
+    var handler = new AcceleratorHandler();
+    handler.DispatchKernel(start, end, """
+
+    for j in range(i + 1):
+        adjusted = j + 1
+        funcstr += "buf{a}, ".format(a=adjusted)
+
+    funcstr += """action);
+}
+
+"""
+
+    ofile.write(funcstr)

From 89b5e449ce4145f8f1bf63798e6584cf9aa71aa8 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 05:29:56 -0600
Subject: [PATCH 10/61] update python to only generate up to 13 data params

---
 DotMP/GPU/Python/dispatch_gen.py | 31 ++++++++++++++++++-------------
 DotMP/GPU/Python/parfor_gen.py   |  2 +-
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/DotMP/GPU/Python/dispatch_gen.py b/DotMP/GPU/Python/dispatch_gen.py
index f001dc9b..a9d0f08d 100644
--- a/DotMP/GPU/Python/dispatch_gen.py
+++ b/DotMP/GPU/Python/dispatch_gen.py
@@ -1,4 +1,4 @@
-ofile = open("./parfor_dump.cs", "w")
+ofile = open("./dispatch_dump.cs", "w")
 
 cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight",
              "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
@@ -8,13 +8,11 @@
 letters = ["T", "U", "V", "W", "X", "Y", "Z",
            "A", "B", "C", "D", "E", "F", "G", "H", "I"]
 
-for i in range(0, 16):
+for i in range(0, 13):
     funcstr = ""
 
     funcstr += """/// <summary>
-/// Creates a GPU parallel for loop.
-/// The body of the kernel is run on a GPU target.
-/// This overload specifies that {c} arrays are used on the GPU.
+/// Dispatches a kernel with {c} parameters.
 /// </summary>
 /// <param name="start">The start of the loop, inclusive.</param>
 /// <param name="end">The end of the loop, exclusive.</param>""".format(c=cardinals[i])
@@ -33,7 +31,7 @@
 /// <typeparam name="{l}">The base type of the {o} argument. Must be an unmanaged type.</typeparam>""".format(l=letters[j], o=ordinals[j])
 
     funcstr += """
-public static void ParallelFor<"""
+internal void DispatchKernel<"""
 
     for j in range(i):
         funcstr += "{l}, ".format(l=letters[j])
@@ -57,16 +55,23 @@
 
     funcstr += """
 {
-    var handler = new AcceleratorHandler();
-    handler.DispatchKernel(start, end, """
+    var idx = new Index();
 
-    for j in range(i + 1):
+    var kernel = accelerator.LoadStreamKernel(action);
+
+    kernel(((end - start) / block_size, block_size), idx,
+"""
+
+    for j in range(i):
         adjusted = j + 1
-        funcstr += "buf{a}, ".format(a=adjusted)
+        funcstr += """        new GPUArray<{l}>(buf{a}.View),
+""".format(l=letters[j], a=adjusted)
 
-    funcstr += """action);
-}
+    funcstr += """        new GPUArray<{l}>(buf{a}.View));
 
-"""
+    Synchronize();
+""".format(l=letters[i], a=i + 1)
+
+    funcstr += "}\n\n"
 
     ofile.write(funcstr)
diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py
index f001dc9b..c119b624 100644
--- a/DotMP/GPU/Python/parfor_gen.py
+++ b/DotMP/GPU/Python/parfor_gen.py
@@ -8,7 +8,7 @@
 letters = ["T", "U", "V", "W", "X", "Y", "Z",
            "A", "B", "C", "D", "E", "F", "G", "H", "I"]
 
-for i in range(0, 16):
+for i in range(0, 13):
     funcstr = ""
 
     funcstr += """/// <summary>

From 7b9093336eba6c3de29142ef6e4ae1ec9f8dc695 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 05:30:10 -0600
Subject: [PATCH 11/61] add parfor overloads for up to 13 data parameters

---
 DotMP/GPU/AcceleratorHandler.cs | 465 +++++++++++++++++++++++++++++---
 DotMP/GPU/Gpu.cs                | 177 ------------
 2 files changed, 432 insertions(+), 210 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index 220d57bd..579bb127 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -47,14 +47,14 @@ internal AcceleratorHandler()
         private void Synchronize() => accelerator.Synchronize();
 
         /// <summary>
-        /// Dispatches a kernel with one data parameter.
+        /// Dispatches a kernel with one parameter.
         /// </summary>
-        /// <typeparam name="T">The type of the data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf">The buffer to run the kernel with.</param>
-        /// <param name="action">The action to perform.</param>
-        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action)
+        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf1, Action<Index, GPUArray<T>> action)
             where T : unmanaged
         {
             var idx = new Index();
@@ -62,21 +62,21 @@ internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index,
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf.View));
+                new GPUArray<T>(buf1.View));
 
             Synchronize();
         }
 
         /// <summary>
-        /// Dispatches a kernel with two data parameters.
+        /// Dispatches a kernel with two parameters.
         /// </summary>
-        /// <typeparam name="T">The type of the first data parameter.</typeparam>
-        /// <typeparam name="U">The type of the second data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
         /// <param name="buf1">The first buffer to run the kernel with.</param>
         /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="action">The action to perform.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         internal void DispatchKernel<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action)
             where T : unmanaged
             where U : unmanaged
@@ -93,17 +93,17 @@ internal void DispatchKernel<T, U>(int start, int end, Buffer<T> buf1, Buffer<U>
         }
 
         /// <summary>
-        /// Dispatches a kernel with three data parameters.
+        /// Dispatches a kernel with three parameters.
         /// </summary>
-        /// <typeparam name="T">The type of the first data parameter.</typeparam>
-        /// <typeparam name="U">The type of the second data parameter.</typeparam>
-        /// <typeparam name="V">The type of the third data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
         /// <param name="buf1">The first buffer to run the kernel with.</param>
         /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="action">The action to perform.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action)
             where T : unmanaged
             where U : unmanaged
@@ -124,17 +124,17 @@ internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer
         /// <summary>
         /// Dispatches a kernel with four parameters.
         /// </summary>
-        /// <typeparam name="T">The type of the first data parameter.</typeparam>
-        /// <typeparam name="U">The type of the second data parameter.</typeparam>
-        /// <typeparam name="V">The type of the third data parameter.</typeparam>
-        /// <typeparam name="W">The type of the fourth data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
         /// <param name="buf1">The first buffer to run the kernel with.</param>
         /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="action">The action to perform.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
         internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action)
             where T : unmanaged
             where U : unmanaged
@@ -157,11 +157,6 @@ internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buf
         /// <summary>
         /// Dispatches a kernel with five parameters.
         /// </summary>
-        /// <typeparam name="T">The type of the first data parameter.</typeparam>
-        /// <typeparam name="U">The type of the second data parameter.</typeparam>
-        /// <typeparam name="V">The type of the third data parameter.</typeparam>
-        /// <typeparam name="W">The type of the fourth data parameter.</typeparam>
-        /// <typeparam name="X">The type of the fifth data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
         /// <param name="buf1">The first buffer to run the kernel with.</param>
@@ -169,7 +164,12 @@ internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buf
         /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="action">The action to perform.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
         internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action)
             where T : unmanaged
             where U : unmanaged
@@ -194,12 +194,6 @@ internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1,
         /// <summary>
         /// Dispatches a kernel with six parameters.
         /// </summary>
-        /// <typeparam name="T">The type of the first data parameter.</typeparam>
-        /// <typeparam name="U">The type of the second data parameter.</typeparam>
-        /// <typeparam name="V">The type of the third data parameter.</typeparam>
-        /// <typeparam name="W">The type of the fourth data parameter.</typeparam>
-        /// <typeparam name="X">The type of the fifth data parameter.</typeparam>
-        /// <typeparam name="Y">The type of the sixth data parameter.</typeparam>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
         /// <param name="buf1">The first buffer to run the kernel with.</param>
@@ -208,7 +202,13 @@ internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1,
         /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="buf5">The fifth buffer to run the kernel with.</param>
         /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="action">The action to perform.</param>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
         internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action)
             where T : unmanaged
             where U : unmanaged
@@ -231,5 +231,404 @@ internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf
 
             Synchronize();
         }
+
+        /// <summary>
+        /// Loads <paramref name="action"/> as a stream kernel, launches it over seven buffers, then calls Synchronize().
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop range.</param>
+        /// <param name="end">Exclusive upper bound of the loop range.</param>
+        /// <param name="buf1">Buffer whose View becomes the kernel's first GPUArray argument.</param>
+        /// <param name="buf2">Buffer whose View becomes the kernel's second GPUArray argument.</param>
+        /// <param name="buf3">Buffer whose View becomes the kernel's third GPUArray argument.</param>
+        /// <param name="buf4">Buffer whose View becomes the kernel's fourth GPUArray argument.</param>
+        /// <param name="buf5">Buffer whose View becomes the kernel's fifth GPUArray argument.</param>
+        /// <param name="buf6">Buffer whose View becomes the kernel's sixth GPUArray argument.</param>
+        /// <param name="buf7">Buffer whose View becomes the kernel's seventh GPUArray argument.</param>
+        /// <param name="action">Kernel body; receives an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">Type argument of <paramref name="buf1"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="U">Type argument of <paramref name="buf2"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="V">Type argument of <paramref name="buf3"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="W">Type argument of <paramref name="buf4"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="X">Type argument of <paramref name="buf5"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Y">Type argument of <paramref name="buf6"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Z">Type argument of <paramref name="buf7"/>; constrained to unmanaged.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+        {
+            var index = new Index();
+            var launcher = accelerator.LoadStreamKernel(action);
+            var groupCount = (end - start) / block_size;
+            // NOTE(review): if block_size is integral the quotient truncates; a range that is not a multiple of block_size presumably leaves trailing iterations unlaunched - confirm.
+            launcher((groupCount, block_size), index,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View),
+                new GPUArray<Z>(buf7.View));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Loads <paramref name="action"/> as a stream kernel, launches it over eight buffers, then calls Synchronize().
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop range.</param>
+        /// <param name="end">Exclusive upper bound of the loop range.</param>
+        /// <param name="buf1">Buffer whose View becomes the kernel's first GPUArray argument.</param>
+        /// <param name="buf2">Buffer whose View becomes the kernel's second GPUArray argument.</param>
+        /// <param name="buf3">Buffer whose View becomes the kernel's third GPUArray argument.</param>
+        /// <param name="buf4">Buffer whose View becomes the kernel's fourth GPUArray argument.</param>
+        /// <param name="buf5">Buffer whose View becomes the kernel's fifth GPUArray argument.</param>
+        /// <param name="buf6">Buffer whose View becomes the kernel's sixth GPUArray argument.</param>
+        /// <param name="buf7">Buffer whose View becomes the kernel's seventh GPUArray argument.</param>
+        /// <param name="buf8">Buffer whose View becomes the kernel's eighth GPUArray argument.</param>
+        /// <param name="action">Kernel body; receives an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">Type argument of <paramref name="buf1"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="U">Type argument of <paramref name="buf2"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="V">Type argument of <paramref name="buf3"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="W">Type argument of <paramref name="buf4"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="X">Type argument of <paramref name="buf5"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Y">Type argument of <paramref name="buf6"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Z">Type argument of <paramref name="buf7"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="A">Type argument of <paramref name="buf8"/>; constrained to unmanaged.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+        {
+            var index = new Index();
+            var launcher = accelerator.LoadStreamKernel(action);
+            var groupCount = (end - start) / block_size;
+            // NOTE(review): if block_size is integral the quotient truncates; a range that is not a multiple of block_size presumably leaves trailing iterations unlaunched - confirm.
+            launcher((groupCount, block_size), index,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View),
+                new GPUArray<Z>(buf7.View),
+                new GPUArray<A>(buf8.View));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Loads <paramref name="action"/> as a stream kernel, launches it over nine buffers, then calls Synchronize().
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop range.</param>
+        /// <param name="end">Exclusive upper bound of the loop range.</param>
+        /// <param name="buf1">Buffer whose View becomes the kernel's first GPUArray argument.</param>
+        /// <param name="buf2">Buffer whose View becomes the kernel's second GPUArray argument.</param>
+        /// <param name="buf3">Buffer whose View becomes the kernel's third GPUArray argument.</param>
+        /// <param name="buf4">Buffer whose View becomes the kernel's fourth GPUArray argument.</param>
+        /// <param name="buf5">Buffer whose View becomes the kernel's fifth GPUArray argument.</param>
+        /// <param name="buf6">Buffer whose View becomes the kernel's sixth GPUArray argument.</param>
+        /// <param name="buf7">Buffer whose View becomes the kernel's seventh GPUArray argument.</param>
+        /// <param name="buf8">Buffer whose View becomes the kernel's eighth GPUArray argument.</param>
+        /// <param name="buf9">Buffer whose View becomes the kernel's ninth GPUArray argument.</param>
+        /// <param name="action">Kernel body; receives an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">Type argument of <paramref name="buf1"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="U">Type argument of <paramref name="buf2"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="V">Type argument of <paramref name="buf3"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="W">Type argument of <paramref name="buf4"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="X">Type argument of <paramref name="buf5"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Y">Type argument of <paramref name="buf6"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Z">Type argument of <paramref name="buf7"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="A">Type argument of <paramref name="buf8"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="B">Type argument of <paramref name="buf9"/>; constrained to unmanaged.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+        {
+            var index = new Index();
+            var launcher = accelerator.LoadStreamKernel(action);
+            var groupCount = (end - start) / block_size;
+            // NOTE(review): if block_size is integral the quotient truncates; a range that is not a multiple of block_size presumably leaves trailing iterations unlaunched - confirm.
+            launcher((groupCount, block_size), index,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View),
+                new GPUArray<Z>(buf7.View),
+                new GPUArray<A>(buf8.View),
+                new GPUArray<B>(buf9.View));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Loads <paramref name="action"/> as a stream kernel, launches it over ten buffers, then calls Synchronize().
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop range.</param>
+        /// <param name="end">Exclusive upper bound of the loop range.</param>
+        /// <param name="buf1">Buffer whose View becomes the kernel's first GPUArray argument.</param>
+        /// <param name="buf2">Buffer whose View becomes the kernel's second GPUArray argument.</param>
+        /// <param name="buf3">Buffer whose View becomes the kernel's third GPUArray argument.</param>
+        /// <param name="buf4">Buffer whose View becomes the kernel's fourth GPUArray argument.</param>
+        /// <param name="buf5">Buffer whose View becomes the kernel's fifth GPUArray argument.</param>
+        /// <param name="buf6">Buffer whose View becomes the kernel's sixth GPUArray argument.</param>
+        /// <param name="buf7">Buffer whose View becomes the kernel's seventh GPUArray argument.</param>
+        /// <param name="buf8">Buffer whose View becomes the kernel's eighth GPUArray argument.</param>
+        /// <param name="buf9">Buffer whose View becomes the kernel's ninth GPUArray argument.</param>
+        /// <param name="buf10">Buffer whose View becomes the kernel's tenth GPUArray argument.</param>
+        /// <param name="action">Kernel body; receives an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">Type argument of <paramref name="buf1"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="U">Type argument of <paramref name="buf2"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="V">Type argument of <paramref name="buf3"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="W">Type argument of <paramref name="buf4"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="X">Type argument of <paramref name="buf5"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Y">Type argument of <paramref name="buf6"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Z">Type argument of <paramref name="buf7"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="A">Type argument of <paramref name="buf8"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="B">Type argument of <paramref name="buf9"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="C">Type argument of <paramref name="buf10"/>; constrained to unmanaged.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+        {
+            var index = new Index();
+            var launcher = accelerator.LoadStreamKernel(action);
+            var groupCount = (end - start) / block_size;
+            // NOTE(review): if block_size is integral the quotient truncates; a range that is not a multiple of block_size presumably leaves trailing iterations unlaunched - confirm.
+            launcher((groupCount, block_size), index,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View),
+                new GPUArray<Z>(buf7.View),
+                new GPUArray<A>(buf8.View),
+                new GPUArray<B>(buf9.View),
+                new GPUArray<C>(buf10.View));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Loads <paramref name="action"/> as a stream kernel, launches it over eleven buffers, then calls Synchronize().
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop range.</param>
+        /// <param name="end">Exclusive upper bound of the loop range.</param>
+        /// <param name="buf1">Buffer whose View becomes the kernel's first GPUArray argument.</param>
+        /// <param name="buf2">Buffer whose View becomes the kernel's second GPUArray argument.</param>
+        /// <param name="buf3">Buffer whose View becomes the kernel's third GPUArray argument.</param>
+        /// <param name="buf4">Buffer whose View becomes the kernel's fourth GPUArray argument.</param>
+        /// <param name="buf5">Buffer whose View becomes the kernel's fifth GPUArray argument.</param>
+        /// <param name="buf6">Buffer whose View becomes the kernel's sixth GPUArray argument.</param>
+        /// <param name="buf7">Buffer whose View becomes the kernel's seventh GPUArray argument.</param>
+        /// <param name="buf8">Buffer whose View becomes the kernel's eighth GPUArray argument.</param>
+        /// <param name="buf9">Buffer whose View becomes the kernel's ninth GPUArray argument.</param>
+        /// <param name="buf10">Buffer whose View becomes the kernel's tenth GPUArray argument.</param>
+        /// <param name="buf11">Buffer whose View becomes the kernel's eleventh GPUArray argument.</param>
+        /// <param name="action">Kernel body; receives an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">Type argument of <paramref name="buf1"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="U">Type argument of <paramref name="buf2"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="V">Type argument of <paramref name="buf3"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="W">Type argument of <paramref name="buf4"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="X">Type argument of <paramref name="buf5"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Y">Type argument of <paramref name="buf6"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Z">Type argument of <paramref name="buf7"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="A">Type argument of <paramref name="buf8"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="B">Type argument of <paramref name="buf9"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="C">Type argument of <paramref name="buf10"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="D">Type argument of <paramref name="buf11"/>; constrained to unmanaged.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+        {
+            var index = new Index();
+            var launcher = accelerator.LoadStreamKernel(action);
+            var groupCount = (end - start) / block_size;
+            // NOTE(review): if block_size is integral the quotient truncates; a range that is not a multiple of block_size presumably leaves trailing iterations unlaunched - confirm.
+            launcher((groupCount, block_size), index,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View),
+                new GPUArray<Z>(buf7.View),
+                new GPUArray<A>(buf8.View),
+                new GPUArray<B>(buf9.View),
+                new GPUArray<C>(buf10.View),
+                new GPUArray<D>(buf11.View));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Loads <paramref name="action"/> as a stream kernel, launches it over twelve buffers, then calls Synchronize().
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop range.</param>
+        /// <param name="end">Exclusive upper bound of the loop range.</param>
+        /// <param name="buf1">Buffer whose View becomes the kernel's first GPUArray argument.</param>
+        /// <param name="buf2">Buffer whose View becomes the kernel's second GPUArray argument.</param>
+        /// <param name="buf3">Buffer whose View becomes the kernel's third GPUArray argument.</param>
+        /// <param name="buf4">Buffer whose View becomes the kernel's fourth GPUArray argument.</param>
+        /// <param name="buf5">Buffer whose View becomes the kernel's fifth GPUArray argument.</param>
+        /// <param name="buf6">Buffer whose View becomes the kernel's sixth GPUArray argument.</param>
+        /// <param name="buf7">Buffer whose View becomes the kernel's seventh GPUArray argument.</param>
+        /// <param name="buf8">Buffer whose View becomes the kernel's eighth GPUArray argument.</param>
+        /// <param name="buf9">Buffer whose View becomes the kernel's ninth GPUArray argument.</param>
+        /// <param name="buf10">Buffer whose View becomes the kernel's tenth GPUArray argument.</param>
+        /// <param name="buf11">Buffer whose View becomes the kernel's eleventh GPUArray argument.</param>
+        /// <param name="buf12">Buffer whose View becomes the kernel's twelfth GPUArray argument.</param>
+        /// <param name="action">Kernel body; receives an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">Type argument of <paramref name="buf1"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="U">Type argument of <paramref name="buf2"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="V">Type argument of <paramref name="buf3"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="W">Type argument of <paramref name="buf4"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="X">Type argument of <paramref name="buf5"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Y">Type argument of <paramref name="buf6"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Z">Type argument of <paramref name="buf7"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="A">Type argument of <paramref name="buf8"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="B">Type argument of <paramref name="buf9"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="C">Type argument of <paramref name="buf10"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="D">Type argument of <paramref name="buf11"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="E">Type argument of <paramref name="buf12"/>; constrained to unmanaged.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+        {
+            var index = new Index();
+            var launcher = accelerator.LoadStreamKernel(action);
+            var groupCount = (end - start) / block_size;
+            // NOTE(review): if block_size is integral the quotient truncates; a range that is not a multiple of block_size presumably leaves trailing iterations unlaunched - confirm.
+            launcher((groupCount, block_size), index,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View),
+                new GPUArray<Z>(buf7.View),
+                new GPUArray<A>(buf8.View),
+                new GPUArray<B>(buf9.View),
+                new GPUArray<C>(buf10.View),
+                new GPUArray<D>(buf11.View),
+                new GPUArray<E>(buf12.View));
+
+            Synchronize();
+        }
+
+        /// <summary>
+        /// Loads <paramref name="action"/> as a stream kernel, launches it over thirteen buffers, then calls Synchronize().
+        /// </summary>
+        /// <param name="start">Inclusive lower bound of the loop range.</param>
+        /// <param name="end">Exclusive upper bound of the loop range.</param>
+        /// <param name="buf1">Buffer whose View becomes the kernel's first GPUArray argument.</param>
+        /// <param name="buf2">Buffer whose View becomes the kernel's second GPUArray argument.</param>
+        /// <param name="buf3">Buffer whose View becomes the kernel's third GPUArray argument.</param>
+        /// <param name="buf4">Buffer whose View becomes the kernel's fourth GPUArray argument.</param>
+        /// <param name="buf5">Buffer whose View becomes the kernel's fifth GPUArray argument.</param>
+        /// <param name="buf6">Buffer whose View becomes the kernel's sixth GPUArray argument.</param>
+        /// <param name="buf7">Buffer whose View becomes the kernel's seventh GPUArray argument.</param>
+        /// <param name="buf8">Buffer whose View becomes the kernel's eighth GPUArray argument.</param>
+        /// <param name="buf9">Buffer whose View becomes the kernel's ninth GPUArray argument.</param>
+        /// <param name="buf10">Buffer whose View becomes the kernel's tenth GPUArray argument.</param>
+        /// <param name="buf11">Buffer whose View becomes the kernel's eleventh GPUArray argument.</param>
+        /// <param name="buf12">Buffer whose View becomes the kernel's twelfth GPUArray argument.</param>
+        /// <param name="buf13">Buffer whose View becomes the kernel's thirteenth GPUArray argument.</param>
+        /// <param name="action">Kernel body; receives an Index followed by one GPUArray per buffer.</param>
+        /// <typeparam name="T">Type argument of <paramref name="buf1"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="U">Type argument of <paramref name="buf2"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="V">Type argument of <paramref name="buf3"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="W">Type argument of <paramref name="buf4"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="X">Type argument of <paramref name="buf5"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Y">Type argument of <paramref name="buf6"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="Z">Type argument of <paramref name="buf7"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="A">Type argument of <paramref name="buf8"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="B">Type argument of <paramref name="buf9"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="C">Type argument of <paramref name="buf10"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="D">Type argument of <paramref name="buf11"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="E">Type argument of <paramref name="buf12"/>; constrained to unmanaged.</typeparam>
+        /// <typeparam name="F">Type argument of <paramref name="buf13"/>; constrained to unmanaged.</typeparam>
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+            where X : unmanaged
+            where Y : unmanaged
+            where Z : unmanaged
+            where A : unmanaged
+            where B : unmanaged
+            where C : unmanaged
+            where D : unmanaged
+            where E : unmanaged
+            where F : unmanaged
+        {
+            var index = new Index();
+            var launcher = accelerator.LoadStreamKernel(action);
+            var groupCount = (end - start) / block_size;
+            // NOTE(review): if block_size is integral the quotient truncates; a range that is not a multiple of block_size presumably leaves trailing iterations unlaunched - confirm.
+            launcher((groupCount, block_size), index,
+                new GPUArray<T>(buf1.View),
+                new GPUArray<U>(buf2.View),
+                new GPUArray<V>(buf3.View),
+                new GPUArray<W>(buf4.View),
+                new GPUArray<X>(buf5.View),
+                new GPUArray<Y>(buf6.View),
+                new GPUArray<Z>(buf7.View),
+                new GPUArray<A>(buf8.View),
+                new GPUArray<B>(buf9.View),
+                new GPUArray<C>(buf10.View),
+                new GPUArray<D>(buf11.View),
+                new GPUArray<E>(buf12.View),
+                new GPUArray<F>(buf13.View));
+
+            Synchronize();
+        }
     }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
index 624c6033..1237a75d 100644
--- a/DotMP/GPU/Gpu.cs
+++ b/DotMP/GPU/Gpu.cs
@@ -464,182 +464,5 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start,
             var handler = new AcceleratorHandler();
             handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action);
         }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that fourteen arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
-        /// <param name="buf14">The fourteenth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="G">The base type of the fourteenth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F, G>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Buffer<G> buf14, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>, GPUArray<G>> action)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-            where E : unmanaged
-            where F : unmanaged
-            where G : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, action);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that fifteen arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
-        /// <param name="buf14">The fourteenth buffer to run the kernel with.</param>
-        /// <param name="buf15">The fifteenth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="G">The base type of the fourteenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="H">The base type of the fifteenth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F, G, H>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Buffer<G> buf14, Buffer<H> buf15, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>, GPUArray<G>, GPUArray<H>> action)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-            where E : unmanaged
-            where F : unmanaged
-            where G : unmanaged
-            where H : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, action);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that sixteen arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
-        /// <param name="buf14">The fourteenth buffer to run the kernel with.</param>
-        /// <param name="buf15">The fifteenth buffer to run the kernel with.</param>
-        /// <param name="buf16">The sixteenth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="G">The base type of the fourteenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="H">The base type of the fifteenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="I">The base type of the sixteenth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F, G, H, I>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Buffer<G> buf14, Buffer<H> buf15, Buffer<I> buf16, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>, GPUArray<G>, GPUArray<H>, GPUArray<I>> action)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-            where E : unmanaged
-            where F : unmanaged
-            where G : unmanaged
-            where H : unmanaged
-            where I : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, buf14, buf15, buf16, action);
-        }
     }
 }
\ No newline at end of file

From 8a87b5c4b511a0893b5a590aff2a4732311d6085 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 05:35:18 -0600
Subject: [PATCH 12/61] fix cardinality in documentation

---
 DotMP/GPU/AcceleratorHandler.cs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index 579bb127..05ebe6b9 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -47,14 +47,14 @@ internal AcceleratorHandler()
         private void Synchronize() => accelerator.Synchronize();
 
         /// <summary>
-        /// Dispatches a kernel with one parameters.
+        /// Dispatches a kernel with one parameter.
         /// </summary>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
+        /// <param name="buf">The buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf1, Action<Index, GPUArray<T>> action)
+        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action)
             where T : unmanaged
         {
             var idx = new Index();
@@ -62,7 +62,7 @@ internal void DispatchKernel<T>(int start, int end, Buffer<T> buf1, Action<Index
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View));
+                new GPUArray<T>(buf.View));
 
             Synchronize();
         }

From ef3035f33363eb24934097e06af9a7b2b3cba9fb Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 08:54:35 -0600
Subject: [PATCH 13/61] implement 2D arrays into GPUArray and Buffer objects

---
 DotMP/GPU/AcceleratorHandler.cs | 188 ++++++++++++++++----------------
 DotMP/GPU/Buffer.cs             | 104 +++++++++++++++---
 DotMP/GPU/GpuArray.cs           |  63 +++++++++--
 3 files changed, 243 insertions(+), 112 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index 05ebe6b9..95d415ff 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -35,7 +35,11 @@ internal AcceleratorHandler()
             if (initialized) return;
 
             context = Context.CreateDefault();
-            accelerator = context.Devices[0].CreateAccelerator(context);
+            accelerator = context.Devices[1].CreateAccelerator(context);
+            foreach (var d in context.Devices)
+            {
+                Console.WriteLine("Detected {0} accelerator.", d.ToString());
+            }
             Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString());
             initialized = true;
             block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256;
@@ -62,7 +66,7 @@ internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index,
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf.View));
+                new GPUArray<T>(buf));
 
             Synchronize();
         }
@@ -86,8 +90,8 @@ internal void DispatchKernel<T, U>(int start, int end, Buffer<T> buf1, Buffer<U>
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2));
 
             Synchronize();
         }
@@ -114,9 +118,9 @@ internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3));
 
             Synchronize();
         }
@@ -146,10 +150,10 @@ internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buf
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4));
 
             Synchronize();
         }
@@ -182,11 +186,11 @@ internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1,
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5));
 
             Synchronize();
         }
@@ -222,12 +226,12 @@ internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6));
 
             Synchronize();
         }
@@ -266,13 +270,13 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T>
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View),
-                new GPUArray<Z>(buf7.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7));
 
             Synchronize();
         }
@@ -314,14 +318,14 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View),
-                new GPUArray<Z>(buf7.View),
-                new GPUArray<A>(buf8.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8));
 
             Synchronize();
         }
@@ -366,15 +370,15 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buff
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View),
-                new GPUArray<Z>(buf7.View),
-                new GPUArray<A>(buf8.View),
-                new GPUArray<B>(buf9.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9));
 
             Synchronize();
         }
@@ -422,16 +426,16 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, B
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View),
-                new GPUArray<Z>(buf7.View),
-                new GPUArray<A>(buf8.View),
-                new GPUArray<B>(buf9.View),
-                new GPUArray<C>(buf10.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10));
 
             Synchronize();
         }
@@ -482,17 +486,17 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View),
-                new GPUArray<Z>(buf7.View),
-                new GPUArray<A>(buf8.View),
-                new GPUArray<B>(buf9.View),
-                new GPUArray<C>(buf10.View),
-                new GPUArray<D>(buf11.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10),
+                new GPUArray<D>(buf11));
 
             Synchronize();
         }
@@ -546,18 +550,18 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View),
-                new GPUArray<Z>(buf7.View),
-                new GPUArray<A>(buf8.View),
-                new GPUArray<B>(buf9.View),
-                new GPUArray<C>(buf10.View),
-                new GPUArray<D>(buf11.View),
-                new GPUArray<E>(buf12.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10),
+                new GPUArray<D>(buf11),
+                new GPUArray<E>(buf12));
 
             Synchronize();
         }
@@ -614,19 +618,19 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, i
             var kernel = accelerator.LoadStreamKernel(action);
 
             kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1.View),
-                new GPUArray<U>(buf2.View),
-                new GPUArray<V>(buf3.View),
-                new GPUArray<W>(buf4.View),
-                new GPUArray<X>(buf5.View),
-                new GPUArray<Y>(buf6.View),
-                new GPUArray<Z>(buf7.View),
-                new GPUArray<A>(buf8.View),
-                new GPUArray<B>(buf9.View),
-                new GPUArray<C>(buf10.View),
-                new GPUArray<D>(buf11.View),
-                new GPUArray<E>(buf12.View),
-                new GPUArray<F>(buf13.View));
+                new GPUArray<T>(buf1),
+                new GPUArray<U>(buf2),
+                new GPUArray<V>(buf3),
+                new GPUArray<W>(buf4),
+                new GPUArray<X>(buf5),
+                new GPUArray<Y>(buf6),
+                new GPUArray<Z>(buf7),
+                new GPUArray<A>(buf8),
+                new GPUArray<B>(buf9),
+                new GPUArray<C>(buf10),
+                new GPUArray<D>(buf11),
+                new GPUArray<E>(buf12),
+                new GPUArray<F>(buf13));
 
             Synchronize();
         }
diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs
index e0bd6908..2ec5b33c 100644
--- a/DotMP/GPU/Buffer.cs
+++ b/DotMP/GPU/Buffer.cs
@@ -1,5 +1,5 @@
 using System;
-using DotMP.GPU;
+using ILGPU;
 using ILGPU.Runtime;
 
 namespace DotMP.GPU
@@ -32,11 +32,15 @@ public enum Behavior
     public class Buffer<T> : IDisposable
         where T : unmanaged
     {
+        /// <summary>
+        /// The ILGPU buffer for 1D arrays.
+        /// </summary>
+        private MemoryBuffer1D<T, Stride1D.Dense> buf1d;
 
         /// <summary>
-        /// The ILGPU buffer.
+        /// The ILGPU buffer for 2D arrays.
         /// </summary>
-        private MemoryBuffer1D<T, ILGPU.Stride1D.Dense> buf;
+        private MemoryBuffer2D<T, Stride2D.DenseY> buf2d;
 
         /// <summary>
         /// Behavior of the data, as specified by Behavior.
@@ -44,12 +48,41 @@ public class Buffer<T> : IDisposable
         private Buffer.Behavior behavior;
 
         /// <summary>
-        /// The CPU array, so that we can copy the data back.
+        /// The CPU 1D array, so that we can copy the data back.
+        /// </summary>
+        private T[] data1d;
+
+        /// <summary>
+        /// The CPU 2D array, so that we can copy the data back.
+        /// </summary>
+        private T[,] data2d;
+
+        /// <summary>
+        /// Backing field for the number of dimensions in the array.
+        /// </summary>
+        private int dims;
+
+        /// <summary>
+        /// The number of dimensions in the array.
         /// </summary>
-        private T[] data;
+        internal int Dimensions
+        {
+            get
+            {
+                return dims;
+            }
+            // Private setter: only values from 1 to 3 are accepted; anything else throws ArgumentOutOfRangeException.
+            private set
+            {
+                if (value < 1 || value > 3) // the one-string overload would treat the text as a parameter name, so pass name, value and message explicitly
+                    throw new ArgumentOutOfRangeException(nameof(value), value, "Number of dimensions must be between 1 and 3.");
+
+                dims = value;
+            }
+        }
 
         /// <summary>
-        /// Constructor for buffer object. Allocates data on the GPU and makes it available for the next GPU kernel.
+        /// Constructor for buffer object. Allocates a 1D array on the GPU and makes it available for the next GPU kernel.
         /// </summary>
         /// <param name="data">The data to allocate on the GPU.</param>
         /// <param name="behavior">The behavior of the data, see Behavior.</param>
@@ -58,18 +91,46 @@ public Buffer(T[] data, Buffer.Behavior behavior)
             new AcceleratorHandler();
 
             this.behavior = behavior;
-            this.data = data;
+            this.data1d = data;
 
             switch (behavior)
             {
                 case Buffer.Behavior.To:
                 case Buffer.Behavior.ToFrom:
-                    buf = AcceleratorHandler.accelerator.Allocate1D(data);
+                    buf1d = AcceleratorHandler.accelerator.Allocate1D(data);
                     break;
                 case Buffer.Behavior.From:
-                    buf = AcceleratorHandler.accelerator.Allocate1D<T>(data.Length);
+                    buf1d = AcceleratorHandler.accelerator.Allocate1D<T>(data.Length);
                     break;
             }
+
+            Dimensions = 1;
+        }
+
+        /// <summary>
+        /// Constructor for buffer object. Allocates a 2D array on the GPU and makes it available for the next GPU kernel.
+        /// </summary>
+        /// <param name="data">The data to allocate on the GPU.</param>
+        /// <param name="behavior">The behavior of the data, see Behavior.</param>
+        public Buffer(T[,] data, Buffer.Behavior behavior)
+        {
+            new AcceleratorHandler();
+
+            this.behavior = behavior;
+            this.data2d = data;
+
+            switch (behavior)
+            {
+                case Buffer.Behavior.To:
+                case Buffer.Behavior.ToFrom:
+                    buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY(data);
+                    break;
+                case Buffer.Behavior.From:
+                    buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY<T>((data.GetLength(0), data.GetLength(1)));
+                    break;
+            }
+
+            Dimensions = 2;
         }
 
         /// <summary>
@@ -77,17 +138,34 @@ public Buffer(T[] data, Buffer.Behavior behavior)
         /// </summary>
         public void Dispose()
         {
-            if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom)
+            if (Dimensions == 1)
             {
-                buf.GetAsArray1D().CopyTo(data, 0);
+                if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom)
+                {
+                    buf1d.GetAsArray1D().CopyTo(data1d, 0);
+                }
+
+                buf1d.Dispose();
             }
+            else if (Dimensions == 2)
+            {
+                if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom)
+                {
+                    buf2d.GetAsArray2D().CopyTo(data2d, 0);
+                }
 
-            buf.Dispose();
+                buf2d.Dispose();
+            }
         }
 
         /// <summary>
         /// Get the view of the memory for the GPU.
         /// </summary>
-        internal ArrayView1D<T, ILGPU.Stride1D.Dense> View { get => buf.View; }
+        internal ArrayView1D<T, Stride1D.Dense> View1D { get => buf1d.View; }
+
+        /// <summary>
+        /// Get the 2D view of the memory for the GPU.
+        /// </summary>
+        internal ArrayView2D<T, Stride2D.DenseY> View2D { get => buf2d.View; }
     }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index 55c45edb..ffc898ef 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -1,4 +1,5 @@
 using ILGPU;
+using ILGPU.Runtime;
 using System;
 
 namespace DotMP.GPU
@@ -11,17 +12,43 @@ public struct GPUArray<T>
         where T : unmanaged
     {
         /// <summary>
-        /// Internal ArrayView object.
+        /// The ILGPU array view for 1D arrays.
         /// </summary>
-        private ArrayView<T> arrayView;
+        private ArrayView1D<T, Stride1D.Dense> view1d;
+
+        /// <summary>
+        /// The ILGPU array view for 2D arrays.
+        /// </summary>
+        private ArrayView2D<T, Stride2D.DenseY> view2d;
+
+        /// <summary>
+        /// Number of dimensions.
+        /// </summary>
+        private int dims;
 
         /// <summary>
         /// Constructor.
         /// </summary>
         /// <param name="arrayView">The ArrayView to wrap.</param>
-        public GPUArray(ArrayView<T> arrayView)
+        public GPUArray(Buffer<T> arrayView)
         {
-            this.arrayView = arrayView;
+            if (arrayView.Dimensions == 1)
+            {
+                view1d = arrayView.View1D;
+                view2d = ArrayView2D<T, Stride2D.DenseY>.Empty;
+            }
+            else if (arrayView.Dimensions == 2)
+            {
+                view1d = ArrayView1D<T, Stride1D.Dense>.Empty;
+                view2d = arrayView.View2D;
+            }
+            else
+            {
+                view1d = ArrayView1D<T, Stride1D.Dense>.Empty;
+                view2d = ArrayView2D<T, Stride2D.DenseY>.Empty;
+            }
+
+            dims = arrayView.Dimensions;
         }
 
         /// <summary>
@@ -31,8 +58,20 @@ public GPUArray(ArrayView<T> arrayView)
         /// <returns>The data at that ID.</returns>
         public T this[int idx]
         {
-            get => arrayView[idx];
-            set => arrayView[idx] = value;
+            get => view1d[idx];
+            set => view1d[idx] = value;
+        }
+
+        /// <summary>
+        /// Overload for [,] operator.
+        /// </summary>
+        /// <param name="i">The first ID to index into.</param>
+        /// <param name="j">The second ID to index into.</param>
+        /// <returns>The data at that ID.</returns>
+        public T this[int i, int j]
+        {
+            get => view2d[i, j];
+            set => view2d[i, j] = value;
         }
 
         /// <summary>
@@ -40,7 +79,17 @@ public T this[int idx]
         /// </summary>
         public int Length
         {
-            get => arrayView.IntLength;
+            get
+            {
+                switch (dims)
+                {
+                    case 1:
+                    default:
+                        return view1d.IntLength;
+                    case 2:
+                        return view2d.IntLength;
+                }
+            }
         }
     }
 }
\ No newline at end of file

From 4eb444543392df07778564b72bcf1316b858b226 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 08:54:51 -0600
Subject: [PATCH 14/61] initial commit of GPU heat transfer benchmark

---
 .../GPUHeatTransfer/GPUHeatTransfer.csproj    |  10 +
 benchmarks/GPUHeatTransfer/Program.cs         | 302 ++++++++++++++++++
 2 files changed, 312 insertions(+)
 create mode 100644 benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
 create mode 100644 benchmarks/GPUHeatTransfer/Program.cs

diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
new file mode 100644
index 00000000..d4398000
--- /dev/null
+++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
@@ -0,0 +1,10 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net7.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+</Project>
diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
new file mode 100644
index 00000000..3889f953
--- /dev/null
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -0,0 +1,302 @@
+﻿/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Diagnosers;
+
+/* jscpd:ignore-start */
+
+[SimpleJob(RuntimeMoniker.Net60)]
+[ThreadingDiagnoser]
+[HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
+// test heat transfer using Parallel.For
+public class HeatTransfer
+{
+    // scratch array
+    private double[,] scratch = new double[0, 0];
+    // grid array
+    private double[,] grid = new double[0, 0];
+
+    //private 
+
+    // parallel type enum
+    public enum ParType { DMPFor, DMPGPU }
+
+    // test a dim of 500x500
+    [Params(500)]
+    public int dim;
+
+    // test with 100 steps
+    [Params(100)]
+    public int steps;
+
+    // test with both parallel types
+    [Params(ParType.DMPFor, ParType.DMPGPU)]
+    public ParType type;
+
+    // change this to configure the number of threads to use
+    public uint num_threads = 6;
+
+    // run the setup
+    [GlobalSetup]
+    public void Setup()
+    {
+        scratch = new double[dim, dim];
+        grid = new double[dim, dim];
+
+        grid[0, dim / 2 - 1] = 100.0;
+        grid[0, dim / 2] = 100.0;
+    }
+
+    //run the simulation
+    [Benchmark]
+    public void DoSimulation()
+    {
+        Action action = () =>
+        {
+            //do the steps
+            for (int i = 0; i < steps; i++)
+            {
+                DoStep();
+            }
+        };
+
+        if (type == ParType.DMPGPU)
+        {
+            action();
+        }
+        else
+        {
+            // spawn a parallel region
+            DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action);
+        }
+    }
+
+    //do a step of the heat transfer simulation
+    public void DoStep()
+    {
+        switch (type)
+        {
+            case ParType.DMPFor:
+                //iterate over all cells not on the border
+                DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i =>
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        //set the scratch array to the average of the surrounding cells
+                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
+                    }
+                });
+
+                //copy the scratch array to the grid array
+                DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i =>
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        grid[i, j] = scratch[i, j];
+                    }
+                });
+                break;
+
+            case ParType.DMPGPU:
+                DotMP.GPU.ParallelFor();
+                break;
+        }
+    }
+}
+
+// test heat transfer using Parallel.For
+public class HeatTransferVerify
+{
+    // scratch array
+    private double[,] scratch = new double[0, 0];
+    // grid array
+    private double[,] grid = new double[0, 0];
+
+    // parallel type enum
+    public enum ParType { TPL, For, ForCollapse, Serial }
+
+    // test dims of 100x100, 1000x1000, and 5000x5000
+    public int dim = 500;
+
+    // test with 10 steps and 100 steps
+    public int steps = 100;
+
+    // test with all 3 parallel types
+    public ParType type = ParType.For;
+
+    // change this to configure the number of threads to use
+    public uint num_threads = 6;
+
+    // run the setup
+    public void Setup()
+    {
+        scratch = new double[dim, dim];
+        grid = new double[dim, dim];
+
+        grid[0, dim / 2 - 1] = 100.0;
+        grid[0, dim / 2] = 100.0;
+    }
+
+    //run the simulation
+    public void DoSimulation()
+    {
+        Action action = () =>
+        {
+            //do the steps
+            for (int i = 0; i < steps; i++)
+            {
+                DoStep();
+            }
+        };
+
+        if (type == ParType.TPL || type == ParType.Serial)
+        {
+            action();
+        }
+        else
+        {
+            // spawn a parallel region
+            DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action);
+        }
+    }
+
+    //do a step of the heat transfer simulation
+    public void DoStep()
+    {
+        switch (type)
+        {
+            case ParType.TPL:
+                //iterate over all cells not on the border
+                System.Threading.Tasks.Parallel.For(1, dim - 1, i =>
+                {
+                    System.Threading.Tasks.Parallel.For(1, dim - 1, j =>
+                    {
+                        //set the scratch array to the average of the surrounding cells
+                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
+                    });
+                });
+
+                //copy the scratch array to the grid array
+                System.Threading.Tasks.Parallel.For(1, dim - 1, i =>
+                {
+                    System.Threading.Tasks.Parallel.For(1, dim - 1, j =>
+                    {
+                        grid[i, j] = scratch[i, j];
+                    });
+                });
+                break;
+
+            case ParType.For:
+                //iterate over all cells not on the border
+                DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i =>
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        //set the scratch array to the average of the surrounding cells
+                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
+                    }
+                });
+
+                //copy the scratch array to the grid array
+                DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i =>
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        grid[i, j] = scratch[i, j];
+                    }
+                });
+                break;
+
+            case ParType.ForCollapse:
+                //iterate over all cells not on the border
+                DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) =>
+                {
+                    //set the scratch array to the average of the surrounding cells
+                    scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
+                });
+
+                //copy the scratch array to the grid array
+                DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) =>
+                {
+                    grid[i, j] = scratch[i, j];
+                });
+                break;
+
+            case ParType.Serial:
+                for (int i = 1; i < dim - 1; i++)
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        //set the scratch array to the average of the surrounding cells
+                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
+                    }
+                }
+
+                //copy the scratch array to the grid array
+                for (int i = 1; i < dim - 1; i++)
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        grid[i, j] = scratch[i, j];
+                    }
+                }
+                break;
+        }
+    }
+
+    public void Verify()
+    {
+        Setup();
+        type = ParType.For;
+        DoSimulation();
+        double[,] gridA = grid;
+
+        Setup();
+        type = ParType.Serial;
+        DoSimulation();
+        double[,] gridB = grid;
+
+        bool wrong = false;
+
+        for (int i = 0; i < dim; i++)
+            for (int j = 0; j < dim; j++)
+                if (gridA[i, j] != gridB[i, j])
+                    wrong = true;
+
+        if (wrong)
+            Console.WriteLine("WRONG RESULT");
+        else
+            Console.WriteLine("RIGHT RESULT");
+    }
+}
+
+/* jscpd:ignore-end */
+
+// driver
+public class Program
+{
+    public static void Main(string[] args)
+    {
+        if (args.Length > 0 && args[0] == "verify")
+            new HeatTransferVerify().Verify();
+        else
+            BenchmarkRunner.Run<HeatTransfer>();
+    }
+}
\ No newline at end of file

From b5365b787a45d2fb2fba7fc5ebb6cdf29840e2f5 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 10:37:06 -0600
Subject: [PATCH 15/61] add nocopy behavior

---
 DotMP/GPU/Buffer.cs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs
index 2ec5b33c..63e35e8e 100644
--- a/DotMP/GPU/Buffer.cs
+++ b/DotMP/GPU/Buffer.cs
@@ -22,7 +22,11 @@ public enum Behavior
             /// <summary>
             /// Specifies that data should be transfered both to and from the GPU.
             /// </summary>
-            ToFrom
+            ToFrom,
+            /// <summary>
+            /// Specifies that the data shouldn't be transferred to or from the GPU. For internal use.
+            /// </summary>
+            NoCopy
         }
     }
 
@@ -100,6 +104,7 @@ public Buffer(T[] data, Buffer.Behavior behavior)
                     buf1d = AcceleratorHandler.accelerator.Allocate1D(data);
                     break;
                 case Buffer.Behavior.From:
+                case Buffer.Behavior.NoCopy:
                     buf1d = AcceleratorHandler.accelerator.Allocate1D<T>(data.Length);
                     break;
             }
@@ -126,6 +131,7 @@ public Buffer(T[,] data, Buffer.Behavior behavior)
                     buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY(data);
                     break;
                 case Buffer.Behavior.From:
+                case Buffer.Behavior.NoCopy:
                     buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY<T>((data.GetLength(0), data.GetLength(1)));
                     break;
             }

From c092eb94002decb48d389dc244bf59bf7461e7ca Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 10:37:22 -0600
Subject: [PATCH 16/61] fix exception on OpenCL devices

---
 DotMP/GPU/GpuArray.cs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index ffc898ef..5be53869 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -35,17 +35,21 @@ public GPUArray(Buffer<T> arrayView)
             if (arrayView.Dimensions == 1)
             {
                 view1d = arrayView.View1D;
-                view2d = ArrayView2D<T, Stride2D.DenseY>.Empty;
+                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
             }
             else if (arrayView.Dimensions == 2)
             {
-                view1d = ArrayView1D<T, Stride1D.Dense>.Empty;
+                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
                 view2d = arrayView.View2D;
             }
             else
             {
-                view1d = ArrayView1D<T, Stride1D.Dense>.Empty;
-                view2d = ArrayView2D<T, Stride2D.DenseY>.Empty;
+                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
+                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
             }
 
             dims = arrayView.Dimensions;

From f66b22795b74f4e60c6221925c571c4c410c6a2b Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 11:44:40 -0600
Subject: [PATCH 17/61] better accelerator selection

---
 DotMP/GPU/AcceleratorHandler.cs | 37 ++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index 95d415ff..d369489b 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -35,11 +35,20 @@ internal AcceleratorHandler()
             if (initialized) return;
 
             context = Context.CreateDefault();
-            accelerator = context.Devices[1].CreateAccelerator(context);
+            var selectedDevice = context.Devices[0];
+
             foreach (var d in context.Devices)
             {
                 Console.WriteLine("Detected {0} accelerator.", d.ToString());
+
+                if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL)
+                    selectedDevice = d;
+                if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda)
+                    selectedDevice = d;
             }
+
+            accelerator = selectedDevice.CreateAccelerator(context);
+
             Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString());
             initialized = true;
             block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256;
@@ -61,7 +70,7 @@ internal AcceleratorHandler()
         internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action)
             where T : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -85,7 +94,7 @@ internal void DispatchKernel<T, U>(int start, int end, Buffer<T> buf1, Buffer<U>
             where T : unmanaged
             where U : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -113,7 +122,7 @@ internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer
             where U : unmanaged
             where V : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -145,7 +154,7 @@ internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buf
             where V : unmanaged
             where W : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -181,7 +190,7 @@ internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1,
             where W : unmanaged
             where X : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -221,7 +230,7 @@ internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf
             where X : unmanaged
             where Y : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -265,7 +274,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T>
             where Y : unmanaged
             where Z : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -313,7 +322,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<
             where Z : unmanaged
             where A : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -365,7 +374,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buff
             where A : unmanaged
             where B : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -421,7 +430,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, B
             where B : unmanaged
             where C : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -481,7 +490,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end
             where C : unmanaged
             where D : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -545,7 +554,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int
             where D : unmanaged
             where E : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 
@@ -613,7 +622,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, i
             where E : unmanaged
             where F : unmanaged
         {
-            var idx = new Index();
+            var idx = new Index(start);
 
             var kernel = accelerator.LoadStreamKernel(action);
 

From 79c4a357ac1ba96fcbfd407cd04eaaad43b7bee0 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 11:45:11 -0600
Subject: [PATCH 18/61] fix copying back 2D arrays

---
 DotMP/GPU/Buffer.cs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs
index 63e35e8e..253f4968 100644
--- a/DotMP/GPU/Buffer.cs
+++ b/DotMP/GPU/Buffer.cs
@@ -1,4 +1,5 @@
 using System;
+using System.Runtime.CompilerServices;
 using ILGPU;
 using ILGPU.Runtime;
 
@@ -157,7 +158,7 @@ public void Dispose()
             {
                 if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom)
                 {
-                    buf2d.GetAsArray2D().CopyTo(data2d, 0);
+                    System.Buffer.BlockCopy(buf2d.GetAsArray2D(), 0, data2d, 0, Unsafe.SizeOf<T>() * data2d.Length);
                 }
 
                 buf2d.Dispose();

From 528cba09f8b895346c2ccd23ab8957f3a1c979ba Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 11:46:13 -0600
Subject: [PATCH 19/61] added start offset for index calculations

---
 DotMP/GPU/Handle.cs | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/DotMP/GPU/Handle.cs b/DotMP/GPU/Handle.cs
index c90f773f..a67f41a2 100644
--- a/DotMP/GPU/Handle.cs
+++ b/DotMP/GPU/Handle.cs
@@ -8,13 +8,27 @@ namespace DotMP.GPU
     /// </summary>
     public struct Index
     {
+        /// <summary>
+        /// The start of the for loop, for index calculations.
+        /// </summary>
+        private int start;
+
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="start">The start of the parallel for loop.</param>
+        internal Index(int start)
+        {
+            this.start = start;
+        }
+
         /// <summary>
         /// Gets the index of the loop.
         /// </summary>
         /// <param name="h">Unused.</param>
         public static implicit operator int(Index h)
         {
-            return Grid.GlobalIndex.X;
+            return Grid.GlobalIndex.X + h.start;
         }
     }
 }
\ No newline at end of file

From 6121c0dfbac71ed4feca00d4b5b8cb3a53ca4bc5 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 11:46:30 -0600
Subject: [PATCH 20/61] get HeatTransferVerify running properly

---
 .../GPUHeatTransfer/GPUHeatTransfer.csproj    |   8 ++
 benchmarks/GPUHeatTransfer/Program.cs         | 108 +++++++++---------
 2 files changed, 64 insertions(+), 52 deletions(-)

diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
index d4398000..41e8c54b 100644
--- a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
+++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
@@ -7,4 +7,12 @@
     <Nullable>enable</Nullable>
   </PropertyGroup>
 
+  <ItemGroup>
+    <PackageReference Include="BenchmarkDotNet" Version="0.13.10" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\DotMP\DotMP.csproj" />
+  </ItemGroup>
+
 </Project>
diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
index 3889f953..6d4be932 100644
--- a/benchmarks/GPUHeatTransfer/Program.cs
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -53,6 +53,12 @@ public enum ParType { DMPFor, DMPGPU }
     // change this to configure the number of threads to use
     public uint num_threads = 6;
 
+    // buffer for grid
+    private DotMP.GPU.Buffer<double> gridbuf;
+
+    // buffer for scratch
+    private DotMP.GPU.Buffer<double> scratchbuf;
+
     // run the setup
     [GlobalSetup]
     public void Setup()
@@ -62,6 +68,9 @@ public void Setup()
 
         grid[0, dim / 2 - 1] = 100.0;
         grid[0, dim / 2] = 100.0;
+
+        gridbuf = new DotMP.GPU.Buffer<double>(grid, DotMP.GPU.Buffer.Behavior.To);
+        scratchbuf = new DotMP.GPU.Buffer<double>(scratch, DotMP.GPU.Buffer.Behavior.NoCopy);
     }
 
     //run the simulation
@@ -115,7 +124,22 @@ public void DoStep()
                 break;
 
             case ParType.DMPGPU:
-                DotMP.GPU.ParallelFor();
+                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        //set the scratch array to the average of the surrounding cells
+                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
+                    }
+                });
+
+                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
+                {
+                    for (int j = 1; j < dim - 1; j++)
+                    {
+                        grid[i, j] = scratch[i, j];
+                    }
+                });
                 break;
         }
     }
@@ -130,20 +154,26 @@ public class HeatTransferVerify
     private double[,] grid = new double[0, 0];
 
     // parallel type enum
-    public enum ParType { TPL, For, ForCollapse, Serial }
+    public enum ParType { DMPFor, DMPGPU }
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    public int dim = 500;
+    public int dim = 514;
 
     // test with 10 steps and 100 steps
     public int steps = 100;
 
     // test with all 3 parallel types
-    public ParType type = ParType.For;
+    public ParType type = ParType.DMPFor;
 
     // change this to configure the number of threads to use
     public uint num_threads = 6;
 
+    // buffer for grid
+    private DotMP.GPU.Buffer<double> gridbuf;
+
+    // buffer for scratch
+    private DotMP.GPU.Buffer<double> scratchbuf;
+
     // run the setup
     public void Setup()
     {
@@ -152,6 +182,12 @@ public void Setup()
 
         grid[0, dim / 2 - 1] = 100.0;
         grid[0, dim / 2] = 100.0;
+
+        if (type == ParType.DMPGPU)
+        {
+            gridbuf = new DotMP.GPU.Buffer<double>(grid, DotMP.GPU.Buffer.Behavior.ToFrom);
+            scratchbuf = new DotMP.GPU.Buffer<double>(scratch, DotMP.GPU.Buffer.Behavior.NoCopy);
+        }
     }
 
     //run the simulation
@@ -166,9 +202,11 @@ public void DoSimulation()
             }
         };
 
-        if (type == ParType.TPL || type == ParType.Serial)
+        if (type == ParType.DMPGPU)
         {
             action();
+            gridbuf.Dispose();
+            scratchbuf.Dispose();
         }
         else
         {
@@ -182,28 +220,7 @@ public void DoStep()
     {
         switch (type)
         {
-            case ParType.TPL:
-                //iterate over all cells not on the border
-                System.Threading.Tasks.Parallel.For(1, dim - 1, i =>
-                {
-                    System.Threading.Tasks.Parallel.For(1, dim - 1, j =>
-                    {
-                        //set the scratch array to the average of the surrounding cells
-                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
-                    });
-                });
-
-                //copy the scratch array to the grid array
-                System.Threading.Tasks.Parallel.For(1, dim - 1, i =>
-                {
-                    System.Threading.Tasks.Parallel.For(1, dim - 1, j =>
-                    {
-                        grid[i, j] = scratch[i, j];
-                    });
-                });
-                break;
-
-            case ParType.For:
+            case ParType.DMPFor:
                 //iterate over all cells not on the border
                 DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i =>
                 {
@@ -224,52 +241,36 @@ public void DoStep()
                 });
                 break;
 
-            case ParType.ForCollapse:
-                //iterate over all cells not on the border
-                DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) =>
-                {
-                    //set the scratch array to the average of the surrounding cells
-                    scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
-                });
-
-                //copy the scratch array to the grid array
-                DotMP.Parallel.ForCollapse((1, dim - 1), (1, dim - 1), schedule: DotMP.Schedule.Guided, action: (i, j) =>
-                {
-                    grid[i, j] = scratch[i, j];
-                });
-                break;
-
-            case ParType.Serial:
-                for (int i = 1; i < dim - 1; i++)
+            case ParType.DMPGPU:
+                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
                 {
-                    for (int j = 1; j < dim - 1; j++)
+                    for (int j = 1; j < 514 - 1; j++)
                     {
                         //set the scratch array to the average of the surrounding cells
                         scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
                     }
-                }
+                });
 
-                //copy the scratch array to the grid array
-                for (int i = 1; i < dim - 1; i++)
+                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
                 {
-                    for (int j = 1; j < dim - 1; j++)
+                    for (int j = 1; j < 514 - 1; j++)
                     {
                         grid[i, j] = scratch[i, j];
                     }
-                }
+                });
                 break;
         }
     }
 
     public void Verify()
     {
+        type = ParType.DMPFor;
         Setup();
-        type = ParType.For;
         DoSimulation();
         double[,] gridA = grid;
 
+        type = ParType.DMPGPU;
         Setup();
-        type = ParType.Serial;
         DoSimulation();
         double[,] gridB = grid;
 
@@ -278,7 +279,10 @@ public void Verify()
         for (int i = 0; i < dim; i++)
             for (int j = 0; j < dim; j++)
                 if (gridA[i, j] != gridB[i, j])
+                {
                     wrong = true;
+                    Console.WriteLine("Wrong at ({0}, {1}), expected {2}, got {3}.", i, j, gridA[i, j], gridB[i, j]);
+                }
 
         if (wrong)
             Console.WriteLine("WRONG RESULT");

From 62b5ebe515e8e18d3bfb7b87e4e3cd6272f8fc57 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 11:58:48 -0600
Subject: [PATCH 21/61] add LGPL license header

---
 DotMP/GPU/AcceleratorHandler.cs  | 16 ++++++++++
 DotMP/GPU/AssemblyAttributes.cs  | 16 ++++++++++
 DotMP/GPU/Buffer.cs              | 16 ++++++++++
 DotMP/GPU/Exceptions.cs          | 39 ++++++++++---------------
 DotMP/GPU/Gpu.cs                 | 16 ++++++++++
 DotMP/GPU/GpuArray.cs            | 16 ++++++++++
 DotMP/GPU/Handle.cs              | 34 ----------------------
 DotMP/GPU/Index.cs               | 50 ++++++++++++++++++++++++++++++++
 DotMP/GPU/Python/dispatch_gen.py | 16 ++++++++++
 DotMP/GPU/Python/parfor_gen.py   | 16 ++++++++++
 10 files changed, 178 insertions(+), 57 deletions(-)
 delete mode 100644 DotMP/GPU/Handle.cs
 create mode 100644 DotMP/GPU/Index.cs

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index d369489b..c7be93df 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -1,3 +1,19 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
 using System;
 using System.Linq;
 using ILGPU;
diff --git a/DotMP/GPU/AssemblyAttributes.cs b/DotMP/GPU/AssemblyAttributes.cs
index a45c976b..7077a588 100644
--- a/DotMP/GPU/AssemblyAttributes.cs
+++ b/DotMP/GPU/AssemblyAttributes.cs
@@ -1,3 +1,19 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
 using System.Runtime.CompilerServices;
 
 [assembly: InternalsVisibleTo("ILGPURuntime")]
\ No newline at end of file
diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs
index 253f4968..87b756e3 100644
--- a/DotMP/GPU/Buffer.cs
+++ b/DotMP/GPU/Buffer.cs
@@ -1,3 +1,19 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
 using System;
 using System.Runtime.CompilerServices;
 using ILGPU;
diff --git a/DotMP/GPU/Exceptions.cs b/DotMP/GPU/Exceptions.cs
index 58acddb5..4705041b 100644
--- a/DotMP/GPU/Exceptions.cs
+++ b/DotMP/GPU/Exceptions.cs
@@ -1,28 +1,21 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
 using System;
 
 namespace DotMP.GPU
 {
-    /// <summary>
-    /// Exception thrown if too many or too few data movements were specified before a GPU kernel.
-    /// </summary>
-    public class WrongNumberOfDataMovementsSpecifiedException : Exception
-    {
-        /// <summary>
-        /// Constructor with a message.
-        /// </summary>
-        /// <param name="msg">The message to associate with the exception.</param>
-        public WrongNumberOfDataMovementsSpecifiedException(string msg) : base(msg) { }
-    }
-
-    /// <summary>
-    /// Exception thrown if data movement is presented out-of-order.
-    /// </summary>
-    public class ImproperDataMovementOrderingException : Exception
-    {
-        /// <summary>
-        /// Constructor with a message.
-        /// </summary>
-        /// <param name="msg">The message to associate with the exception.</param>
-        public ImproperDataMovementOrderingException(string msg) : base(msg) { }
-    }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
index 1237a75d..c9b8963c 100644
--- a/DotMP/GPU/Gpu.cs
+++ b/DotMP/GPU/Gpu.cs
@@ -1,3 +1,19 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
 using System;
 using ILGPU;
 
diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index 5be53869..fca82deb 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -1,3 +1,19 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
 using ILGPU;
 using ILGPU.Runtime;
 using System;
diff --git a/DotMP/GPU/Handle.cs b/DotMP/GPU/Handle.cs
deleted file mode 100644
index a67f41a2..00000000
--- a/DotMP/GPU/Handle.cs
+++ /dev/null
@@ -1,34 +0,0 @@
-using ILGPU;
-using System;
-
-namespace DotMP.GPU
-{
-    /// <summary>
-    /// Handle for a GPU kernel to retrieve its kernel variables.
-    /// </summary>
-    public struct Index
-    {
-        /// <summary>
-        /// The start of the for loop, for index calculations.
-        /// </summary>
-        private int start;
-
-        /// <summary>
-        /// Constructor.
-        /// </summary>
-        /// <param name="start">The start of the parallel for loop.</param>
-        internal Index(int start)
-        {
-            this.start = start;
-        }
-
-        /// <summary>
-        /// Gets the index of the loop.
-        /// </summary>
-        /// <param name="h">Unused.</param>
-        public static implicit operator int(Index h)
-        {
-            return Grid.GlobalIndex.X + h.start;
-        }
-    }
-}
\ No newline at end of file
diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
new file mode 100644
index 00000000..03355c53
--- /dev/null
+++ b/DotMP/GPU/Index.cs
@@ -0,0 +1,50 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using ILGPU;
+using System;
+
+namespace DotMP.GPU
+{
+    /// <summary>
+    /// Handle for a GPU kernel to retrieve its kernel variables.
+    /// </summary>
+    public struct Index
+    {
+        /// <summary>
+        /// The start of the for loop, for index calculations.
+        /// </summary>
+        private int start;
+
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="start">The start of the parallel for loop.</param>
+        internal Index(int start)
+        {
+            this.start = start;
+        }
+
+        /// <summary>
+        /// Gets the index of the loop.
+        /// </summary>
+        /// <param name="h">The index whose loop start offset is added to the global X index.</param>
+        public static implicit operator int(Index h)
+        {
+            return Grid.GlobalIndex.X + h.start;
+        }
+    }
+}
\ No newline at end of file
diff --git a/DotMP/GPU/Python/dispatch_gen.py b/DotMP/GPU/Python/dispatch_gen.py
index a9d0f08d..bb4152cd 100644
--- a/DotMP/GPU/Python/dispatch_gen.py
+++ b/DotMP/GPU/Python/dispatch_gen.py
@@ -1,3 +1,19 @@
+"""
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+
 ofile = open("./dispatch_dump.cs", "w")
 
 cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight",
diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py
index c119b624..e960e861 100644
--- a/DotMP/GPU/Python/parfor_gen.py
+++ b/DotMP/GPU/Python/parfor_gen.py
@@ -1,3 +1,19 @@
+"""
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+
 ofile = open("./parfor_dump.cs", "w")
 
 cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight",

From d11913c78392968f4fd9c08b2619a095bac523a8 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 11:59:18 -0600
Subject: [PATCH 22/61] prepare benchmark

---
 benchmarks/GPUHeatTransfer/Program.cs | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
index 6d4be932..63c1bb61 100644
--- a/benchmarks/GPUHeatTransfer/Program.cs
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -33,13 +33,11 @@ public class HeatTransfer
     // grid array
     private double[,] grid = new double[0, 0];
 
-    //private 
-
     // parallel type enum
     public enum ParType { DMPFor, DMPGPU }
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    [Params(500)]
+    [Params(514)]
     public int dim;
 
     // test with 10 steps and 100 steps
@@ -69,8 +67,11 @@ public void Setup()
         grid[0, dim / 2 - 1] = 100.0;
         grid[0, dim / 2] = 100.0;
 
-        gridbuf = new DotMP.GPU.Buffer<double>(grid, DotMP.GPU.Buffer.Behavior.To);
-        scratchbuf = new DotMP.GPU.Buffer<double>(scratch, DotMP.GPU.Buffer.Behavior.NoCopy);
+        if (type == ParType.DMPGPU)
+        {
+            gridbuf = new DotMP.GPU.Buffer<double>(grid, DotMP.GPU.Buffer.Behavior.ToFrom);
+            scratchbuf = new DotMP.GPU.Buffer<double>(scratch, DotMP.GPU.Buffer.Behavior.NoCopy);
+        }
     }
 
     //run the simulation
@@ -89,6 +90,8 @@ public void DoSimulation()
         if (type == ParType.DMPGPU)
         {
             action();
+            gridbuf.Dispose();
+            scratchbuf.Dispose();
         }
         else
         {
@@ -126,7 +129,7 @@ public void DoStep()
             case ParType.DMPGPU:
                 DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
                 {
-                    for (int j = 1; j < dim - 1; j++)
+                    for (int j = 1; j < 514 - 1; j++)
                     {
                         //set the scratch array to the average of the surrounding cells
                         scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
@@ -135,7 +138,7 @@ public void DoStep()
 
                 DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
                 {
-                    for (int j = 1; j < dim - 1; j++)
+                    for (int j = 1; j < 514 - 1; j++)
                     {
                         grid[i, j] = scratch[i, j];
                     }

From 8b1242253b26e391a2d3ca60094006fd76979f2b Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 12:06:06 -0600
Subject: [PATCH 23/61] remove dispose for benchmarking

---
 benchmarks/GPUHeatTransfer/Program.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
index 63c1bb61..b3475816 100644
--- a/benchmarks/GPUHeatTransfer/Program.cs
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -90,8 +90,8 @@ public void DoSimulation()
         if (type == ParType.DMPGPU)
         {
             action();
-            gridbuf.Dispose();
-            scratchbuf.Dispose();
+            //gridbuf.Dispose();
+            //scratchbuf.Dispose();
         }
         else
         {

From 20eebd6767ceec8557836676766836cf7a44ea6b Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Fri, 10 Nov 2023 12:06:16 -0600
Subject: [PATCH 24/61] change to .net 6

---
 benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
index 41e8c54b..9cf0a6f0 100644
--- a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
+++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj
@@ -2,7 +2,7 @@
 
   <PropertyGroup>
     <OutputType>Exe</OutputType>
-    <TargetFramework>net7.0</TargetFramework>
+    <TargetFramework>net6.0</TargetFramework>
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
   </PropertyGroup>

From 708d9d564734c43b2ac47652b8fc58c4455a98fe Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 10:18:14 -0600
Subject: [PATCH 25/61] add attributes to prevent exceptions when collecting
 code coverage

---
 DotMP/GPU/GpuArray.cs | 2 ++
 DotMP/GPU/Index.cs    | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index fca82deb..a7d7d705 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -17,6 +17,7 @@
 using ILGPU;
 using ILGPU.Runtime;
 using System;
+using System.Diagnostics.CodeAnalysis;
 
 namespace DotMP.GPU
 {
@@ -24,6 +25,7 @@ namespace DotMP.GPU
     /// Wrapper object for representing arrays on the GPU.
     /// </summary>
     /// <typeparam name="T"></typeparam>
+    [ExcludeFromCodeCoverage]
     public struct GPUArray<T>
         where T : unmanaged
     {
diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index 03355c53..b1e9df75 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -16,12 +16,14 @@
 
 using ILGPU;
 using System;
+using System.Diagnostics.CodeAnalysis;
 
 namespace DotMP.GPU
 {
     /// <summary>
     /// Handle for a GPU kernel to retrieve its kernel variables.
     /// </summary>
+    [ExcludeFromCodeCoverage]
     public struct Index
     {
         /// <summary>

From 2e328e709c975461c351e0fe0556af653d4b4380 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 10:18:23 -0600
Subject: [PATCH 26/61] fix bug

---
 DotMP/Init.cs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/DotMP/Init.cs b/DotMP/Init.cs
index 49629f3f..83de8207 100644
--- a/DotMP/Init.cs
+++ b/DotMP/Init.cs
@@ -160,12 +160,14 @@ internal bool in_for
         {
             get
             {
-                if (in_for_pv == null)
+                int tid = Parallel.GetThreadNum();
+
+                if (in_for_pv == null || tid >= in_for_pv.Length)
                 {
                     return false;
                 }
 
-                return in_for_pv[Parallel.GetThreadNum()];
+                return in_for_pv[tid];
             }
             set
             {

From f3b7735caabf1f217f4210a19a54e5c2392234f0 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 10:18:50 -0600
Subject: [PATCH 27/61] mark old single/ordered/critical regions as obsolete,
 implement new versions

---
 DotMP/Parallel.cs | 167 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 159 insertions(+), 8 deletions(-)

diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs
index f5712643..d24189e3 100644
--- a/DotMP/Parallel.cs
+++ b/DotMP/Parallel.cs
@@ -1,5 +1,7 @@
 ﻿using System;
 using System.Collections.Generic;
+using System.Runtime.CompilerServices;
+using System.ComponentModel;
 using System.Threading;
 
 namespace DotMP
@@ -14,15 +16,15 @@ public static class Parallel
         /// <summary>
         /// The dictionary for critical regions.
         /// </summary>
-        private static volatile Dictionary<int, object> critical_lock = new Dictionary<int, object>();
+        private static volatile Dictionary<string, object> critical_lock = new Dictionary<string, object>();
         /// <summary>
         /// The dictionary for single regions.
         /// </summary>
-        private static volatile HashSet<int> single_thread = new HashSet<int>();
+        private static volatile HashSet<string> single_thread = new HashSet<string>();
         /// <summary>
         /// The dictionary for ordered regions.
         /// </summary>
-        private static volatile Dictionary<int, int> ordered = new Dictionary<int, int>();
+        private static volatile Dictionary<string, int> ordered = new Dictionary<string, int>();
         /// <summary>
         /// Barrier object for DotMP.Parallel.Barrier()
         /// </summary>
@@ -106,6 +108,17 @@ private static void FixArgs(int start, int end, ref Schedule sched, ref uint? ch
             }
         }
 
+        /// <summary>
+        /// Formats the caller information for determining uniqueness of a call.
+        /// </summary>
+        /// <param name="filename">The calling file.</param>
+        /// <param name="linenum">The calling line number.</param>
+        /// <returns>A formatted string representing "{filename}:{linenum}"</returns>
+        private static string FormatCaller(string filename, int linenum)
+        {
+            return string.Format("{0}:{1}", filename, linenum);
+        }
+
         /// <summary>
         /// Creates a for loop inside a parallel region.
         /// A for loop created with For inside of a parallel region is executed in parallel, with iterations being distributed among the threads, and potentially out-of-order.
@@ -950,11 +963,15 @@ public static void ParallelSections(uint? num_threads = null, params Action[] ac
         /// Creates a critical region.
         /// A critical region is a region of code that can only be executed by one thread at a time.
         /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished.
+        /// 
+        /// THIS METHOD IS NOW DEPRECATED.
         /// </summary>
         /// <param name="id">The ID of the critical region. Must be unique per region but consistent across all threads.</param>
         /// <param name="action">The action to be performed in the critical region.</param>
         /// <returns>The ID of the critical region.</returns>
         /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
+        [Obsolete("This version of Critical is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")]
+        [EditorBrowsable(EditorBrowsableState.Never)]
         public static int Critical(int id, Action action)
         {
             if (!InParallel())
@@ -964,6 +981,45 @@ public static int Critical(int id, Action action)
 
             object lock_obj;
 
+            lock (critical_lock)
+            {
+                if (!critical_lock.ContainsKey(id.ToString()))
+                {
+                    critical_lock.Add(id.ToString(), new object());
+                }
+
+                lock_obj = critical_lock[id.ToString()];
+            }
+
+            lock (lock_obj)
+            {
+                action();
+            }
+
+            return id;
+        }
+
+        /// <summary>
+        /// Creates a critical region.
+        /// A critical region is a region of code that can only be executed by one thread at a time.
+        /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished.
+        /// </summary>
+        /// <param name="action">The action to be performed in the critical region.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
+        /// <remarks>This overload returns nothing; the region is keyed by the caller's file path and line number.</remarks>
+        /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
+        public static void Critical(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+        {
+            string id = FormatCaller(path, line);
+
+            if (!InParallel())
+            {
+                throw new NotInParallelRegionException("Cannot use DotMP Critical outside of a parallel region.");
+            }
+
+            object lock_obj;
+
             lock (critical_lock)
             {
                 if (!critical_lock.ContainsKey(id))
@@ -978,8 +1034,6 @@ public static int Critical(int id, Action action)
             {
                 action();
             }
-
-            return id;
         }
 
         /// <summary>
@@ -1034,11 +1088,15 @@ public static void Master(Action action)
         /// Creates a single region.
         /// A single region is only executed once per Parallel.ParallelRegion.
         /// The first thread to encounter the single region marks the region as encountered, then executes it.
+        /// 
+        /// THIS METHOD IS NOW DEPRECATED.
         /// </summary>
         /// <param name="id">The ID of the single region. Must be unique per region but consistent across all threads.</param>
         /// <param name="action">The action to be performed in the single region.</param>
         /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
         /// <exception cref="CannotPerformNestedWorksharingException">Thrown when nested inside another worksharing region.</exception>
+        [Obsolete("This version of Single is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")]
+        [EditorBrowsable(EditorBrowsableState.Never)]
         public static void Single(int id, Action action)
         {
             var freg = new ForkedRegion();
@@ -1058,6 +1116,55 @@ public static void Single(int id, Action action)
 
             Interlocked.Increment(ref freg.in_workshare);
 
+            lock (single_thread)
+            {
+                if (!single_thread.Contains(id.ToString()))
+                {
+                    single_thread.Add(id.ToString());
+                    new_single = true;
+                }
+            }
+
+            if (new_single)
+            {
+                action();
+            }
+
+            Interlocked.Decrement(ref freg.in_workshare);
+
+            Barrier();
+        }
+
+        /// <summary>
+        /// Creates a single region.
+        /// A single region is only executed once per Parallel.ParallelRegion.
+        /// The first thread to encounter the single region marks the region as encountered, then executes it.
+        /// </summary>
+        /// <param name="action">The action to be performed in the single region.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
+        /// <exception cref="CannotPerformNestedWorksharingException">Thrown when nested inside another worksharing region.</exception>
+        public static void Single(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+        {
+            string id = FormatCaller(path, line);
+            var freg = new ForkedRegion();
+            bool new_single = false;
+
+            if (!freg.in_parallel)
+            {
+                throw new NotInParallelRegionException("Cannot use DotMP Single outside of a parallel region.");
+            }
+
+            var ws = new WorkShare();
+
+            if (ws.in_for)
+            {
+                throw new CannotPerformNestedWorksharingException("Cannot use DotMP Single nested within other worksharing constructs.");
+            }
+
+            Interlocked.Increment(ref freg.in_workshare);
+
             lock (single_thread)
             {
                 if (!single_thread.Contains(id))
@@ -1081,10 +1188,14 @@ public static void Single(int id, Action action)
         /// Creates an ordered region.
         /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction&lt;T&gt;() loop.
         /// This also acts as an implicit Critical() region.
+        /// 
+        /// THIS METHOD IS NOW DEPRECATED.
         /// </summary>
         /// <param name="id">The ID of the ordered region. Must be unique per region but consistent across all threads.</param>
         /// <param name="action">The action to be performed in the ordered region.</param>
         /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
+        [Obsolete("This version of Ordered is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")]
+        [EditorBrowsable(EditorBrowsableState.Never)]
         public static void Ordered(int id, Action action)
         {
             var freg = new ForkedRegion();
@@ -1098,22 +1209,62 @@ public static void Ordered(int id, Action action)
 
             lock (ordered)
             {
-                if (!ordered.ContainsKey(id))
+                if (!ordered.ContainsKey(id.ToString()))
                 {
-                    ordered.Add(id, 0);
+                    ordered.Add(id.ToString(), 0);
                 }
                 Thread.MemoryBarrier();
             }
 
             WorkShare ws = new WorkShare();
 
-            while (ordered[id] != ws.thread.working_iter)
+            while (ordered[id.ToString()] != ws.thread.working_iter)
             {
                 freg.reg.spin[tid].SpinOnce();
             }
 
             action();
 
+            lock (ordered)
+            {
+                ordered[id.ToString()]++;
+            }
+        }
+
+        /// <summary>
+        /// Creates an ordered region.
+        /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction&lt;T&gt;() loop.
+        /// This also acts as an implicit Critical() region.
+        /// </summary>
+        /// <param name="action">The action to be performed in the ordered region.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
+        public static void Ordered(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+        {
+            string id = FormatCaller(path, line);
+            var freg = new ForkedRegion();
+
+            if (!freg.in_parallel)
+            {
+                throw new NotInParallelRegionException("Cannot use DotMP Ordered outside of a parallel region.");
+            }
+
+            lock (ordered)
+            {
+                if (!ordered.ContainsKey(id))
+                {
+                    ordered.Add(id, 0);
+                }
+                Thread.MemoryBarrier();
+            }
+
+            WorkShare ws = new WorkShare();
+
+            while (ordered[id] != ws.thread.working_iter) ;
+
+            action();
+
             lock (ordered)
             {
                 ordered[id]++;

From 429bde2b05aa7a2432c43c9500ce8f6e17d3251e Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 10:18:59 -0600
Subject: [PATCH 28/61] remove sln files

---
 examples/CSParallel/KNN/KNN.sln | 25 -------------------------
 examples/DotMP/KNN/KNN.sln      | 25 -------------------------
 examples/Serial/KNN/KNN.sln     | 25 -------------------------
 3 files changed, 75 deletions(-)
 delete mode 100644 examples/CSParallel/KNN/KNN.sln
 delete mode 100644 examples/DotMP/KNN/KNN.sln
 delete mode 100644 examples/Serial/KNN/KNN.sln

diff --git a/examples/CSParallel/KNN/KNN.sln b/examples/CSParallel/KNN/KNN.sln
deleted file mode 100644
index 62ec72ad..00000000
--- a/examples/CSParallel/KNN/KNN.sln
+++ /dev/null
@@ -1,25 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.5.002.0
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KNN", "KNN.csproj", "{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Any CPU = Debug|Any CPU
-		Release|Any CPU = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.Build.0 = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-	GlobalSection(ExtensibilityGlobals) = postSolution
-		SolutionGuid = {AEB4E020-A8E5-48C3-A343-84A0FEBE91E2}
-	EndGlobalSection
-EndGlobal
diff --git a/examples/DotMP/KNN/KNN.sln b/examples/DotMP/KNN/KNN.sln
deleted file mode 100644
index 62ec72ad..00000000
--- a/examples/DotMP/KNN/KNN.sln
+++ /dev/null
@@ -1,25 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.5.002.0
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KNN", "KNN.csproj", "{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Any CPU = Debug|Any CPU
-		Release|Any CPU = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.Build.0 = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-	GlobalSection(ExtensibilityGlobals) = postSolution
-		SolutionGuid = {AEB4E020-A8E5-48C3-A343-84A0FEBE91E2}
-	EndGlobalSection
-EndGlobal
diff --git a/examples/Serial/KNN/KNN.sln b/examples/Serial/KNN/KNN.sln
deleted file mode 100644
index 62ec72ad..00000000
--- a/examples/Serial/KNN/KNN.sln
+++ /dev/null
@@ -1,25 +0,0 @@
-﻿
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio Version 17
-VisualStudioVersion = 17.5.002.0
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KNN", "KNN.csproj", "{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|Any CPU = Debug|Any CPU
-		Release|Any CPU = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Debug|Any CPU.Build.0 = Debug|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.ActiveCfg = Release|Any CPU
-		{0260DB7F-8C67-42A5-A4B6-66A3EC0E95DB}.Release|Any CPU.Build.0 = Release|Any CPU
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-	GlobalSection(ExtensibilityGlobals) = postSolution
-		SolutionGuid = {AEB4E020-A8E5-48C3-A343-84A0FEBE91E2}
-	EndGlobalSection
-EndGlobal

From fab2c475b273b98db24eb0fd24aa4fbdf69c55f4 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 10:20:37 -0600
Subject: [PATCH 29/61] exclude obsolete methods from code coverage

---
 DotMP/Parallel.cs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs
index d24189e3..1c88570a 100644
--- a/DotMP/Parallel.cs
+++ b/DotMP/Parallel.cs
@@ -972,6 +972,7 @@ public static void ParallelSections(uint? num_threads = null, params Action[] ac
         /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
         [Obsolete("This version of Critical is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")]
         [EditorBrowsable(EditorBrowsableState.Never)]
+        [ExcludeFromCodeCoverage]
         public static int Critical(int id, Action action)
         {
             if (!InParallel())
@@ -1097,6 +1098,7 @@ public static void Master(Action action)
         /// <exception cref="CannotPerformNestedWorksharingException">Thrown when nested inside another worksharing region.</exception>
         [Obsolete("This version of Single is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")]
         [EditorBrowsable(EditorBrowsableState.Never)]
+        [ExcludeFromCodeCoverage]
         public static void Single(int id, Action action)
         {
             var freg = new ForkedRegion();
@@ -1196,6 +1198,7 @@ public static void Single(Action action, [CallerFilePath] string path = "", [Cal
         /// <exception cref="NotInParallelRegionException">Thrown when not in a parallel region.</exception>
         [Obsolete("This version of Ordered is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")]
         [EditorBrowsable(EditorBrowsableState.Never)]
+        [ExcludeFromCodeCoverage]
         public static void Ordered(int id, Action action)
         {
             var freg = new ForkedRegion();

From 8b40582bd68e2e4a1b2a825de791739c1bab945e Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 10:29:13 -0600
Subject: [PATCH 30/61] add missing using

---
 DotMP/Parallel.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs
index 1c88570a..6b400d72 100644
--- a/DotMP/Parallel.cs
+++ b/DotMP/Parallel.cs
@@ -3,6 +3,7 @@
 using System.Runtime.CompilerServices;
 using System.ComponentModel;
 using System.Threading;
+using System.Diagnostics.CodeAnalysis;
 
 namespace DotMP
 {

From 594b6bac1712dae783cd3e946f531fca8ed7efde Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 10:29:49 -0600
Subject: [PATCH 31/61] use new critical/ordered/single methods

---
 DotMP-Tests/CPUTests.cs | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/DotMP-Tests/CPUTests.cs b/DotMP-Tests/CPUTests.cs
index defeab2b..9b036e5c 100644
--- a/DotMP-Tests/CPUTests.cs
+++ b/DotMP-Tests/CPUTests.cs
@@ -429,7 +429,7 @@ public void Critical_works()
             DotMP.Parallel.ParallelRegion(num_threads: threads, action: () =>
             {
                 for (int i = 0; i < iters; i++)
-                    DotMP.Parallel.Critical(0, () => ++total);
+                    DotMP.Parallel.Critical(() => ++total);
             });
 
             total.Should().Be((int)threads * iters);
@@ -438,14 +438,13 @@ public void Critical_works()
 
             DotMP.Parallel.ParallelRegion(num_threads: 4, action: () =>
             {
-                if (DotMP.Parallel.GetThreadNum() == 0) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000));
-                if (DotMP.Parallel.GetThreadNum() == 1) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000));
-                if (DotMP.Parallel.GetThreadNum() == 2) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000));
-                if (DotMP.Parallel.GetThreadNum() == 3) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000));
+                if (DotMP.Parallel.GetThreadNum() % 2 == 0) DotMP.Parallel.Critical(() => Thread.Sleep(1000));
+                if (DotMP.Parallel.GetThreadNum() % 2 == 1) DotMP.Parallel.Critical(() => Thread.Sleep(1000));
             });
 
             double elapsed = DotMP.Parallel.GetWTime() - start;
-            elapsed.Should().BeLessThan(2200);
+            elapsed.Should().BeLessThan(2.2);
+            elapsed.Should().BeGreaterThan(2.0);
         }
 
         /// <summary>
@@ -478,7 +477,7 @@ public void Single_works()
             {
                 for (int i = 0; i < 10; i++)
                 {
-                    DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total));
+                    DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total));
                 }
             });
 
@@ -490,7 +489,7 @@ public void Single_works()
             {
                 for (int i = 0; i < 10; i++)
                 {
-                    DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total));
+                    DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total));
                 }
             });
 
@@ -646,7 +645,7 @@ public void Ordered_works()
             DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Static,
                                         num_threads: threads, action: i =>
             {
-                DotMP.Parallel.Ordered(0, () => incrementing[i] = i);
+                DotMP.Parallel.Ordered(() => incrementing[i] = i);
             });
 
             for (int i = 0; i < incrementing.Length; i++)
@@ -1004,7 +1003,7 @@ public void Tasking_works()
 
             DotMP.Parallel.ParallelRegion(num_threads: threads, action: () =>
             {
-                DotMP.Parallel.Single(0, () =>
+                DotMP.Parallel.Single(() =>
                 {
                     for (int i = 0; i < threads * 2; i++)
                     {
@@ -1033,7 +1032,7 @@ public void Tasking_works()
 
             DotMP.Parallel.ParallelRegion(num_threads: threads, action: () =>
             {
-                DotMP.Parallel.Single(0, () =>
+                DotMP.Parallel.Single(() =>
                 {
                     for (int i = 0; i < tasks_to_spawn; i++)
                     {
@@ -1093,7 +1092,7 @@ public void Nested_tasks_work()
 
             DotMP.Parallel.ParallelRegion(num_threads: threads, action: () =>
             {
-                DotMP.Parallel.Single(0, () =>
+                DotMP.Parallel.Single(() =>
                 {
                     DotMP.Parallel.Task(() =>
                     {
@@ -1263,7 +1262,7 @@ public void Non_parallel_single_should_except()
         {
             Assert.Throws<DotMP.NotInParallelRegionException>(() =>
             {
-                DotMP.Parallel.Single(0, () => { });
+                DotMP.Parallel.Single(() => { });
             });
         }
 
@@ -1275,7 +1274,7 @@ public void Non_parallel_critical_should_except()
         {
             Assert.Throws<DotMP.NotInParallelRegionException>(() =>
             {
-                DotMP.Parallel.Critical(0, () => { });
+                DotMP.Parallel.Critical(() => { });
             });
         }
 
@@ -1289,13 +1288,13 @@ public void Nested_worksharing_should_except()
             {
                 Assert.Throws<DotMP.CannotPerformNestedWorksharingException>(() =>
                 {
-                    DotMP.Parallel.Single(0, () => { });
+                    DotMP.Parallel.Single(() => { });
                 });
             });
 
             DotMP.Parallel.ParallelRegion(num_threads: 4, action: () =>
             {
-                DotMP.Parallel.Single(0, () =>
+                DotMP.Parallel.Single(() =>
                 {
                     Assert.Throws<DotMP.CannotPerformNestedWorksharingException>(() =>
                     {
@@ -1321,7 +1320,7 @@ public void Non_for_ordered_should_except()
         {
             Assert.Throws<DotMP.NotInParallelRegionException>(() =>
             {
-                DotMP.Parallel.Ordered(0, () => { });
+                DotMP.Parallel.Ordered(() => { });
             });
         }
 

From 5e90c890edf324165cb80a0a29f0adb23eafe992 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 13:08:57 -0600
Subject: [PATCH 32/61] testing forcollapse performance in heat transfer, will
 fully implement forcollapse later

---
 DotMP/GPU/AcceleratorHandler.cs       | 139 ++++++++++++++++++++++----
 DotMP/GPU/Gpu.cs                      | 113 ++++++++++++++++-----
 DotMP/GPU/Index.cs                    |  60 ++++++++++-
 benchmarks/GPUHeatTransfer/Program.cs |  34 ++-----
 4 files changed, 271 insertions(+), 75 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
index c7be93df..3608a6c1 100644
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ b/DotMP/GPU/AcceleratorHandler.cs
@@ -15,6 +15,7 @@
 */
 
 using System;
+using System.Collections.Generic;
 using System.Linq;
 using ILGPU;
 using ILGPU.Runtime;
@@ -39,9 +40,13 @@ internal class AcceleratorHandler
         /// </summary>
         internal static Accelerator accelerator;
         /// <summary>
-        /// 
+        /// Block size to use for kernels.
         /// </summary>
         private static int block_size;
+        /// <summary>
+        /// Cache of loaded kernels, keyed by the caller-location string.
+        /// </summary>
+        private static Dictionary<string, dynamic> kernels = new Dictionary<string, dynamic>();
 
         /// <summary>
         /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data.
@@ -64,6 +69,7 @@ internal AcceleratorHandler()
             }
 
             accelerator = selectedDevice.CreateAccelerator(context);
+            //accelerator = context.Devices[0].CreateAccelerator(context);
 
             Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString());
             initialized = true;
@@ -75,6 +81,82 @@ internal AcceleratorHandler()
         /// </summary>
         private void Synchronize() => accelerator.Synchronize();
 
+        /// <summary>
+        /// Gets the kernel for this lambda, loading it on first use and caching it under <paramref name="src"/>.
+        /// </summary>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <param name="action">The action provided on the CPU.</param>
+        /// <param name="src">The calling location.</param>
+        /// <returns>The GPU kernel.</returns>
+        private Action<KernelConfig, Index, GPUArray<T>> GetKernel<T>(Action<Index, GPUArray<T>> action, string src)
+            where T : unmanaged
+        {
+            if (!kernels.ContainsKey(src))
+                kernels.Add(src, accelerator.LoadStreamKernel(action));
+
+            return (Action<KernelConfig, Index, GPUArray<T>>)kernels[src];
+        }
+
+        /// <summary>
+        /// Gets the kernel for this lambda, loading it on first use and caching it under <paramref name="src"/>.
+        /// </summary>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <param name="action">The action provided on the CPU.</param>
+        /// <param name="src">The calling location.</param>
+        /// <returns>The GPU kernel.</returns>
+        private Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>> GetKernel<T, U>(Action<Index, GPUArray<T>, GPUArray<U>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+        {
+            if (!kernels.ContainsKey(src))
+                kernels.Add(src, accelerator.LoadStreamKernel(action));
+
+            return (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>>)kernels[src];
+        }
+
+        /// <summary>
+        /// Gets the kernel for this lambda, loading it on first use and caching it under <paramref name="src"/>.
+        /// </summary>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <param name="action">The action provided on the CPU.</param>
+        /// <param name="src">The calling location.</param>
+        /// <returns>The GPU kernel.</returns>
+        private Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> GetKernel<T, U, V>(Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+        {
+            if (!kernels.ContainsKey(src))
+                kernels.Add(src, accelerator.LoadStreamKernel(action));
+
+            return (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>>)kernels[src];
+        }
+
+        /// <summary>
+        /// Gets the kernel for this lambda, loading it on first use and caching it under <paramref name="src"/>.
+        /// </summary>
+        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
+        /// <param name="action">The action provided on the CPU.</param>
+        /// <param name="src">The calling location.</param>
+        /// <returns>The GPU kernel.</returns>
+        private Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> GetKernel<T, U, V, W>(Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, string src)
+            where T : unmanaged
+            where U : unmanaged
+            where V : unmanaged
+            where W : unmanaged
+        {
+            if (!kernels.ContainsKey(src))
+                kernels.Add(src, accelerator.LoadStreamKernel(action));
+
+            return (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>>)kernels[src];
+        }
+
         /// <summary>
         /// Dispatches a kernel with one parameter.
         /// </summary>
@@ -82,13 +164,14 @@ internal AcceleratorHandler()
         /// <param name="end">The end of the loop, exclusive.</param>
         /// <param name="buf">The buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action)
+        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action, string src)
             where T : unmanaged
         {
             var idx = new Index(start);
 
-            var kernel = accelerator.LoadStreamKernel(action);
+            var kernel = GetKernel(action, src);
 
             kernel(((end - start) / block_size, block_size), idx,
                 new GPUArray<T>(buf));
@@ -99,22 +182,23 @@ internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index,
         /// <summary>
         /// Dispatches a kernel with two parameters.
         /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="ranges">The (start, end) pair of each loop; the iteration count is the product of (end - start) over all pairs.</param>
         /// <param name="buf1">The first buffer to run the kernel with.</param>
         /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action)
+        internal void DispatchKernel<T, U>((int, int)[] ranges, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, string src)
             where T : unmanaged
             where U : unmanaged
         {
-            var idx = new Index(start);
+            int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y);
+            var idx = new Index(ranges);
 
-            var kernel = accelerator.LoadStreamKernel(action);
+            var kernel = GetKernel(action, src);
 
-            kernel(((end - start) / block_size, block_size), idx,
+            kernel((len / block_size, block_size), idx,
                 new GPUArray<T>(buf1),
                 new GPUArray<U>(buf2));
 
@@ -130,17 +214,18 @@ internal void DispatchKernel<T, U>(int start, int end, Buffer<T> buf1, Buffer<U>
         /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action)
+        internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
         {
             var idx = new Index(start);
 
-            var kernel = accelerator.LoadStreamKernel(action);
+            var kernel = GetKernel(action, src);
 
             kernel(((end - start) / block_size, block_size), idx,
                 new GPUArray<T>(buf1),
@@ -160,11 +245,12 @@ internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer
         /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action)
+        internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -172,7 +258,7 @@ internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buf
         {
             var idx = new Index(start);
 
-            var kernel = accelerator.LoadStreamKernel(action);
+            var kernel = GetKernel(action, src);
 
             kernel(((end - start) / block_size, block_size), idx,
                 new GPUArray<T>(buf1),
@@ -194,12 +280,13 @@ internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buf
         /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="buf5">The fifth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action)
+        internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -232,13 +319,14 @@ internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1,
         /// <param name="buf5">The fifth buffer to run the kernel with.</param>
         /// <param name="buf6">The sixth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -274,6 +362,7 @@ internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf
         /// <param name="buf6">The sixth buffer to run the kernel with.</param>
         /// <param name="buf7">The seventh buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -281,7 +370,7 @@ internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf
         /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -320,6 +409,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T>
         /// <param name="buf7">The seventh buffer to run the kernel with.</param>
         /// <param name="buf8">The eighth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -328,7 +418,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T>
         /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -370,6 +460,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<
         /// <param name="buf8">The eighth buffer to run the kernel with.</param>
         /// <param name="buf9">The ninth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -379,7 +470,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<
         /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -424,6 +515,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buff
         /// <param name="buf9">The ninth buffer to run the kernel with.</param>
         /// <param name="buf10">The tenth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -434,7 +526,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buff
         /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -482,6 +574,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, B
         /// <param name="buf10">The tenth buffer to run the kernel with.</param>
         /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -493,7 +586,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, B
         /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -544,6 +637,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end
         /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
         /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -556,7 +650,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end
         /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -610,6 +704,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int
         /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
         /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -623,7 +718,7 @@ internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int
         /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action)
+        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action, string src)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
index c9b8963c..1e2cd7a7 100644
--- a/DotMP/GPU/Gpu.cs
+++ b/DotMP/GPU/Gpu.cs
@@ -15,7 +15,7 @@
 */
 
 using System;
-using ILGPU;
+using System.Runtime.CompilerServices;
 
 namespace DotMP.GPU
 {
@@ -26,6 +26,17 @@ namespace DotMP.GPU
     /// </summary>
     public static class Parallel
     {
+        /// <summary>
+        /// Formats the caller information for determining uniqueness of a call.
+        /// </summary>
+        /// <param name="filename">The calling file.</param>
+        /// <param name="linenum">The calling line number.</param>
+        /// <returns>A formatted string representing "{filename}:{linenum}"</returns>
+        private static string FormatCaller(string filename, int linenum)
+        {
+            return string.Format("{0}:{1}", filename, linenum);
+        }
+
         /// <summary>
         /// Creates a GPU parallel for loop.
         /// The body of the kernel is run on a GPU target.
@@ -35,12 +46,15 @@ public static class Parallel
         /// <param name="end">The end of the loop, exclusive.</param>
         /// <param name="buf">The buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action)
+        public static void ParallelFor<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf, action, src);
         }
 
         /// <summary>
@@ -53,14 +67,26 @@ public static void ParallelFor<T>(int start, int end, Buffer<T> buf, Action<Inde
         /// <param name="buf1">The first buffer to run the kernel with.</param>
         /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action)
+        /*public static void ParallelFor<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+            where T : unmanaged
+            where U : unmanaged
+        {
+            var handler = new AcceleratorHandler();
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, action, src);
+        }*/
+
+        public static void ParallelForCollapse<T, U>((int, int) range1, (int, int) range2, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(new (int, int)[] { range1, range2 }, buf1, buf2, action, src);
         }
 
         /// <summary>
@@ -74,16 +100,19 @@ public static void ParallelFor<T, U>(int start, int end, Buffer<T> buf1, Buffer<
         /// <param name="buf2">The second buffer to run the kernel with.</param>
         /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action)
+        public static void ParallelFor<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, action, src);
         }
 
         /// <summary>
@@ -98,18 +127,21 @@ public static void ParallelFor<T, U, V>(int start, int end, Buffer<T> buf1, Buff
         /// <param name="buf3">The third buffer to run the kernel with.</param>
         /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action)
+        public static void ParallelFor<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
             where W : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action, src);
         }
 
         /// <summary>
@@ -125,12 +157,14 @@ public static void ParallelFor<T, U, V, W>(int start, int end, Buffer<T> buf1, B
         /// <param name="buf4">The fourth buffer to run the kernel with.</param>
         /// <param name="buf5">The fifth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action)
+        public static void ParallelFor<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -138,7 +172,8 @@ public static void ParallelFor<T, U, V, W, X>(int start, int end, Buffer<T> buf1
             where X : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action, src);
         }
 
         /// <summary>
@@ -155,13 +190,15 @@ public static void ParallelFor<T, U, V, W, X>(int start, int end, Buffer<T> buf1
         /// <param name="buf5">The fifth buffer to run the kernel with.</param>
         /// <param name="buf6">The sixth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action)
+        public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -170,7 +207,8 @@ public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> b
             where Y : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action, src);
         }
 
         /// <summary>
@@ -188,6 +226,8 @@ public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> b
         /// <param name="buf6">The sixth buffer to run the kernel with.</param>
         /// <param name="buf7">The seventh buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -195,7 +235,7 @@ public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> b
         /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action)
+        public static void ParallelFor<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -205,7 +245,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T
             where Z : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, action, src);
         }
 
         /// <summary>
@@ -224,6 +265,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T
         /// <param name="buf7">The seventh buffer to run the kernel with.</param>
         /// <param name="buf8">The eighth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -232,7 +275,7 @@ public static void ParallelFor<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T
         /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action)
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -243,7 +286,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A>(int start, int end, Buffe
             where A : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action, src);
         }
 
         /// <summary>
@@ -263,6 +307,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A>(int start, int end, Buffe
         /// <param name="buf8">The eighth buffer to run the kernel with.</param>
         /// <param name="buf9">The ninth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -272,7 +318,7 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A>(int start, int end, Buffe
         /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action)
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -284,7 +330,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B>(int start, int end, Bu
             where B : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action, src);
         }
 
         /// <summary>
@@ -305,6 +352,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B>(int start, int end, Bu
         /// <param name="buf9">The ninth buffer to run the kernel with.</param>
         /// <param name="buf10">The tenth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -315,7 +364,7 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B>(int start, int end, Bu
         /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action)
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -328,7 +377,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C>(int start, int end,
             where C : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action, src);
         }
 
         /// <summary>
@@ -350,6 +400,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C>(int start, int end,
         /// <param name="buf10">The tenth buffer to run the kernel with.</param>
         /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -361,7 +413,7 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C>(int start, int end,
         /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action)
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -375,7 +427,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int e
             where D : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action, src);
         }
 
         /// <summary>
@@ -398,6 +451,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int e
         /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
         /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -410,7 +465,7 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int e
         /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action)
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -425,7 +480,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, in
             where E : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action, src);
         }
 
         /// <summary>
@@ -449,6 +505,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, in
         /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
         /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
         /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
@@ -462,7 +520,7 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, in
         /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action)
+        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
             where V : unmanaged
@@ -478,7 +536,8 @@ public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start,
             where F : unmanaged
         {
             var handler = new AcceleratorHandler();
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action);
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action, src);
         }
     }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index b1e9df75..8a0c09f4 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -17,6 +17,8 @@
 using ILGPU;
 using System;
 using System.Diagnostics.CodeAnalysis;
+using System.Linq;
+using System.Runtime.CompilerServices;
 
 namespace DotMP.GPU
 {
@@ -29,7 +31,13 @@ public struct Index
         /// <summary>
         /// The start of the for loop, for index calculations.
         /// </summary>
-        private int start;
+        private int start1;
+        private int start2; // Start of the second range; 0 when the index is built from a single start.
+        // Cached results for the i and j properties; -1 marks "not computed yet".
+        private int i_prv;
+        private int j_prv;
+        // Length of the second range (end minus start); used to split the linear index into i and j.
+        private int diff;
 
         /// <summary>
         /// Constructor.
@@ -37,16 +45,62 @@ public struct Index
         /// <param name="start">The start of the parallel for loop.</param>
         internal Index(int start)
         {
-            this.start = start;
+            this.start1 = start;
+            this.start2 = 0;
+            i_prv = -1;
+            j_prv = -1;
+            diff = 0;
+        }
+        /// <summary>Builds the index for a collapsed loop over two ranges; only ranges[0] and ranges[1] are read.</summary>
+        internal Index((int, int)[] ranges)
+        {
+            start1 = ranges[0].Item1;
+            start2 = ranges[1].Item1;
+            i_prv = -1;
+            j_prv = -1;
+            diff = ranges[1].Item2 - ranges[1].Item1; // length of the second range
         }
 
         /// <summary>
         /// Gets the index of the loop.
         /// </summary>
         /// <param name="h">Unused.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static implicit operator int(Index h)
         {
-            return Grid.GlobalIndex.X + h.start;
+            return Grid.GlobalLinearIndex + h.start1;
+        }
+        /// <summary>Gets the index in the first range: start1 + DivRoundDown(GlobalLinearIndex, diff). The result is cached in i_prv after the first read.</summary>
+        public int i
+        {
+            get
+            {
+                if (i_prv == -1)
+                {
+                    i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff);
+                    j_prv = Grid.GlobalLinearIndex - i_prv * diff;
+                    i_prv += start1;
+                    j_prv += start2;
+                }
+                // NOTE(review): diff is 0 when the index was built from a single start, so the division above has a zero divisor — confirm i is only read in collapsed loops.
+                return i_prv;
+            }
+        }
+        /// <summary>Gets the index in the second range: start2 + the remainder of GlobalLinearIndex after removing whole multiples of diff. The result is cached in j_prv after the first read.</summary>
+        public int j
+        {
+            get
+            {
+                if (j_prv == -1)
+                {
+                    i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff);
+                    j_prv = Grid.GlobalLinearIndex - i_prv * diff;
+                    i_prv += start1;
+                    j_prv += start2;
+                }
+                // NOTE(review): diff is 0 when the index was built from a single start, so the division above has a zero divisor — confirm j is only read in collapsed loops.
+                return j_prv;
+            }
         }
     }
 }
\ No newline at end of file
diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
index b3475816..58ca52e0 100644
--- a/benchmarks/GPUHeatTransfer/Program.cs
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -127,21 +127,15 @@ public void DoStep()
                 break;
 
             case ParType.DMPGPU:
-                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
-                    for (int j = 1; j < 514 - 1; j++)
-                    {
-                        //set the scratch array to the average of the surrounding cells
-                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
-                    }
+                    //set the scratch array to the average of the surrounding cells
+                    scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]);
                 });
 
-                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
-                    for (int j = 1; j < 514 - 1; j++)
-                    {
-                        grid[i, j] = scratch[i, j];
-                    }
+                    grid[idx.i, idx.j] = scratch[idx.i, idx.j];
                 });
                 break;
         }
@@ -245,21 +239,15 @@ public void DoStep()
                 break;
 
             case ParType.DMPGPU:
-                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
-                    for (int j = 1; j < 514 - 1; j++)
-                    {
-                        //set the scratch array to the average of the surrounding cells
-                        scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
-                    }
+                    //set the scratch array to the average of the surrounding cells
+                    scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]);
                 });
 
-                DotMP.GPU.Parallel.ParallelFor(1, dim - 1, gridbuf, scratchbuf, (i, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
-                    for (int j = 1; j < 514 - 1; j++)
-                    {
-                        grid[i, j] = scratch[i, j];
-                    }
+                    grid[idx.i, idx.j] = scratch[idx.i, idx.j];
                 });
                 break;
         }
@@ -306,4 +294,4 @@ public static void Main(string[] args)
         else
             BenchmarkRunner.Run<HeatTransfer>();
     }
-}
\ No newline at end of file
+}

From 348ad0c829ed1175a14766ee8c2d8785c0025988 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 17:20:37 -0600
Subject: [PATCH 33/61] implement T4 template for acceleratorhandler

---
 DotMP/GPU/AcceleratorHandler.tt | 713 ++++++++++++++++++++++++++++++++
 1 file changed, 713 insertions(+)
 create mode 100644 DotMP/GPU/AcceleratorHandler.tt

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
new file mode 100644
index 00000000..9542d5cf
--- /dev/null
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -0,0 +1,713 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+
+<#@ template debug="false" hostspecific="false" language="C#" #>
+<#@ output extension=".cs" #>
+<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' };
+   int max = 13; #>
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using ILGPU;
+using ILGPU.Runtime;
+
+namespace DotMP.GPU
+{
+    /// <summary>
+    /// The handler class managing GPU acceleration.
+    /// </summary>
+    internal class AcceleratorHandler
+    {
+        /// <summary>
+        /// Set once the first constructor call has created the shared context and accelerator; later constructions return early.
+        /// </summary>
+        private static bool initialized = false;
+        /// <summary>
+        /// The GPU context, created by the first constructor call and shared by every handler.
+        /// </summary>
+        private static Context context;
+        /// <summary>
+        /// The accelerator object selected by the first constructor call; kernels are loaded and synchronized through it.
+        /// </summary>
+        internal static Accelerator accelerator;
+        /// <summary>
+        /// Block size to use for kernels (second launch dimension): 16 on the CPU accelerator, 256 otherwise.
+        /// </summary>
+        private static int block_size;
+        /// <summary>
+        /// Kernel cache, keyed by the caller-location string passed to GetKernel.
+        /// </summary>
+        private static Dictionary<string, dynamic> kernels = new Dictionary<string, dynamic>();
+
+        /// <summary>
+        /// Default constructor. The first call creates the shared context, selects a device (preferring CUDA, then OpenCL over CPU) and sets the block size; later calls return immediately.
+        /// </summary>
+        internal AcceleratorHandler()
+        {
+            if (initialized) return;
+
+            context = Context.CreateDefault();
+            var selectedDevice = context.Devices[0];
+
+            foreach (var d in context.Devices)
+            {
+                Console.WriteLine("Detected {0} accelerator.", d.ToString());
+                // An OpenCL device replaces a CPU selection; a CUDA device replaces any non-CUDA selection.
+                if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL)
+                    selectedDevice = d;
+                if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda)
+                    selectedDevice = d;
+            }
+
+            accelerator = selectedDevice.CreateAccelerator(context);
+            // NOTE(review): nothing here guards `initialized` against concurrent first use — confirm handlers are first created on a single thread.
+
+            Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString());
+            initialized = true;
+            block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256;
+        }
+
+        /// <summary>
+        /// Synchronizes pending operations on the shared accelerator by delegating to its Synchronize method.
+        /// </summary>
+        private void Synchronize() => accelerator.Synchronize();
+
+<# for (int c = 1; c <= max; c++) { #>
+        /// <summary>
+        /// Gets the GPU kernel associated with this lambda, loading it on first use and caching it by calling location.
+        /// </summary>
+        /// <param name="action">The action provided on the CPU.</param>
+        /// <param name="src">The calling location, used as the cache key.</param>
+        /// <returns>The GPU kernel.</returns>
+        private Action<KernelConfig, Index,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+> GetKernel<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+>(Action<Index,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
+> action, string src)
+<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
+        {
+            // Single dictionary lookup on a cache hit; the kernel is only loaded the first time a call site is seen.
+            if (!kernels.TryGetValue(src, out dynamic kernel))
+                kernels[src] = kernel = accelerator.LoadStreamKernel(action);
+            return (Action<KernelConfig, Index,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
+            >) kernel;
+        }
+<# } #>
+
+    /// <summary>
+    /// Dispatches a kernel with one parameter and synchronizes the accelerator afterwards.
+    /// </summary>
+    /// <param name="start">The start of the loop, inclusive.</param>
+    /// <param name="end">The end of the loop, exclusive.</param>
+    /// <param name="buf">The buffer to run the kernel with.</param>
+    /// <param name="action">The kernel to run on the GPU.</param>
+    /// <param name="src">The originating caller location.</param>
+    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action, string src)
+        where T : unmanaged
+    {
+        var idx = new Index(start);
+
+        var kernel = GetKernel(action, src);
+        // NOTE(review): (end - start) / block_size truncates, so a trailing partial block of iterations is never launched — confirm callers only pass lengths that are multiples of block_size.
+        kernel(((end - start) / block_size, block_size), idx,
+            new GPUArray<T>(buf));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Dispatches a kernel with two parameters over collapsed ranges and synchronizes the accelerator afterwards.
+    /// </summary>
+    /// <param name="ranges">The (start, end) pair of each loop range; each range contributes end - start iterations.</param>
+    /// <param name="buf1">The first buffer to run the kernel with.</param>
+    /// <param name="buf2">The second buffer to run the kernel with.</param>
+    /// <param name="action">The kernel to run on the GPU.</param>
+    /// <param name="src">The originating caller location.</param>
+    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U>((int, int)[] ranges, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+    {
+        int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y);
+        var idx = new Index(ranges);
+        // NOTE(review): len multiplies every range, but the Index(ranges) constructor only reads ranges[0] and ranges[1] — confirm exactly two ranges are passed.
+        var kernel = GetKernel(action, src);
+        // NOTE(review): len / block_size truncates, so a trailing partial block of iterations is never launched — confirm len is always a multiple of block_size.
+        kernel((len / block_size, block_size), idx,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Dispatches a kernel with three parameters and synchronizes the accelerator afterwards.
+    /// </summary>
+    /// <param name="start">The start of the loop, inclusive.</param>
+    /// <param name="end">The end of the loop, exclusive.</param>
+    /// <param name="buf1">The first buffer to run the kernel with.</param>
+    /// <param name="buf2">The second buffer to run the kernel with.</param>
+    /// <param name="buf3">The third buffer to run the kernel with.</param>
+    /// <param name="action">The kernel to run on the GPU.</param>
+    /// <param name="src">The originating caller location.</param>
+    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+    {
+        var idx = new Index(start);
+
+        var kernel = GetKernel(action, src);
+        // NOTE(review): (end - start) / block_size truncates, so a trailing partial block of iterations is never launched — confirm callers only pass lengths that are multiples of block_size.
+        kernel(((end - start) / block_size, block_size), idx,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes four buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location; forwarded to GetKernel together with the action.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = GetKernel(action, src);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes five buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes six buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes seven buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="buf7">Buffer wrapped in a GPUArray and handed to the kernel as its seventh array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Z">Element type of the seventh buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+        where Z : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6),
+            new GPUArray<Z>(buf7));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes eight buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="buf7">Buffer wrapped in a GPUArray and handed to the kernel as its seventh array.</param>
+    /// <param name="buf8">Buffer wrapped in a GPUArray and handed to the kernel as its eighth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Z">Element type of the seventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="A">Element type of the eighth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+        where Z : unmanaged
+        where A : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6),
+            new GPUArray<Z>(buf7),
+            new GPUArray<A>(buf8));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes nine buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="buf7">Buffer wrapped in a GPUArray and handed to the kernel as its seventh array.</param>
+    /// <param name="buf8">Buffer wrapped in a GPUArray and handed to the kernel as its eighth array.</param>
+    /// <param name="buf9">Buffer wrapped in a GPUArray and handed to the kernel as its ninth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Z">Element type of the seventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="A">Element type of the eighth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="B">Element type of the ninth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+        where Z : unmanaged
+        where A : unmanaged
+        where B : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6),
+            new GPUArray<Z>(buf7),
+            new GPUArray<A>(buf8),
+            new GPUArray<B>(buf9));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes ten buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="buf7">Buffer wrapped in a GPUArray and handed to the kernel as its seventh array.</param>
+    /// <param name="buf8">Buffer wrapped in a GPUArray and handed to the kernel as its eighth array.</param>
+    /// <param name="buf9">Buffer wrapped in a GPUArray and handed to the kernel as its ninth array.</param>
+    /// <param name="buf10">Buffer wrapped in a GPUArray and handed to the kernel as its tenth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Z">Element type of the seventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="A">Element type of the eighth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="B">Element type of the ninth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="C">Element type of the tenth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+        where Z : unmanaged
+        where A : unmanaged
+        where B : unmanaged
+        where C : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6),
+            new GPUArray<Z>(buf7),
+            new GPUArray<A>(buf8),
+            new GPUArray<B>(buf9),
+            new GPUArray<C>(buf10));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes eleven buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="buf7">Buffer wrapped in a GPUArray and handed to the kernel as its seventh array.</param>
+    /// <param name="buf8">Buffer wrapped in a GPUArray and handed to the kernel as its eighth array.</param>
+    /// <param name="buf9">Buffer wrapped in a GPUArray and handed to the kernel as its ninth array.</param>
+    /// <param name="buf10">Buffer wrapped in a GPUArray and handed to the kernel as its tenth array.</param>
+    /// <param name="buf11">Buffer wrapped in a GPUArray and handed to the kernel as its eleventh array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Z">Element type of the seventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="A">Element type of the eighth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="B">Element type of the ninth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="C">Element type of the tenth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="D">Element type of the eleventh buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+        where Z : unmanaged
+        where A : unmanaged
+        where B : unmanaged
+        where C : unmanaged
+        where D : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6),
+            new GPUArray<Z>(buf7),
+            new GPUArray<A>(buf8),
+            new GPUArray<B>(buf9),
+            new GPUArray<C>(buf10),
+            new GPUArray<D>(buf11));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes twelve buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="buf7">Buffer wrapped in a GPUArray and handed to the kernel as its seventh array.</param>
+    /// <param name="buf8">Buffer wrapped in a GPUArray and handed to the kernel as its eighth array.</param>
+    /// <param name="buf9">Buffer wrapped in a GPUArray and handed to the kernel as its ninth array.</param>
+    /// <param name="buf10">Buffer wrapped in a GPUArray and handed to the kernel as its tenth array.</param>
+    /// <param name="buf11">Buffer wrapped in a GPUArray and handed to the kernel as its eleventh array.</param>
+    /// <param name="buf12">Buffer wrapped in a GPUArray and handed to the kernel as its twelfth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Z">Element type of the seventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="A">Element type of the eighth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="B">Element type of the ninth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="C">Element type of the tenth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="D">Element type of the eleventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="E">Element type of the twelfth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+        where Z : unmanaged
+        where A : unmanaged
+        where B : unmanaged
+        where C : unmanaged
+        where D : unmanaged
+        where E : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6),
+            new GPUArray<Z>(buf7),
+            new GPUArray<A>(buf8),
+            new GPUArray<B>(buf9),
+            new GPUArray<C>(buf10),
+            new GPUArray<D>(buf11),
+            new GPUArray<E>(buf12));
+
+        Synchronize();
+    }
+
+    /// <summary>
+    /// Runs a kernel that takes thirteen buffers: builds its Index from start, launches it, then calls Synchronize().
+    /// </summary>
+    /// <param name="start">Inclusive lower bound of the loop; passed to the Index constructor.</param>
+    /// <param name="end">Exclusive upper bound of the loop; together with start it determines the launch size.</param>
+    /// <param name="buf1">Buffer wrapped in a GPUArray and handed to the kernel as its first array.</param>
+    /// <param name="buf2">Buffer wrapped in a GPUArray and handed to the kernel as its second array.</param>
+    /// <param name="buf3">Buffer wrapped in a GPUArray and handed to the kernel as its third array.</param>
+    /// <param name="buf4">Buffer wrapped in a GPUArray and handed to the kernel as its fourth array.</param>
+    /// <param name="buf5">Buffer wrapped in a GPUArray and handed to the kernel as its fifth array.</param>
+    /// <param name="buf6">Buffer wrapped in a GPUArray and handed to the kernel as its sixth array.</param>
+    /// <param name="buf7">Buffer wrapped in a GPUArray and handed to the kernel as its seventh array.</param>
+    /// <param name="buf8">Buffer wrapped in a GPUArray and handed to the kernel as its eighth array.</param>
+    /// <param name="buf9">Buffer wrapped in a GPUArray and handed to the kernel as its ninth array.</param>
+    /// <param name="buf10">Buffer wrapped in a GPUArray and handed to the kernel as its tenth array.</param>
+    /// <param name="buf11">Buffer wrapped in a GPUArray and handed to the kernel as its eleventh array.</param>
+    /// <param name="buf12">Buffer wrapped in a GPUArray and handed to the kernel as its twelfth array.</param>
+    /// <param name="buf13">Buffer wrapped in a GPUArray and handed to the kernel as its thirteenth array.</param>
+    /// <param name="action">The kernel body to execute on the GPU.</param>
+    /// <param name="src">The originating caller location. This overload does not read it.</param>
+    /// <typeparam name="T">Element type of the first buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="U">Element type of the second buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="V">Element type of the third buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="W">Element type of the fourth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="X">Element type of the fifth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Y">Element type of the sixth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="Z">Element type of the seventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="A">Element type of the eighth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="B">Element type of the ninth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="C">Element type of the tenth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="D">Element type of the eleventh buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="E">Element type of the twelfth buffer. Must be an unmanaged type.</typeparam>
+    /// <typeparam name="F">Element type of the thirteenth buffer. Must be an unmanaged type.</typeparam>
+    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action, string src)
+        where T : unmanaged
+        where U : unmanaged
+        where V : unmanaged
+        where W : unmanaged
+        where X : unmanaged
+        where Y : unmanaged
+        where Z : unmanaged
+        where A : unmanaged
+        where B : unmanaged
+        where C : unmanaged
+        where D : unmanaged
+        where E : unmanaged
+        where F : unmanaged
+    {
+        var origin = new Index(start);
+        var launch = accelerator.LoadStreamKernel(action);
+        var groups = (end - start) / block_size;
+        // NOTE(review): assuming block_size is an int, the quotient drops any remainder - confirm ranges are multiples of block_size.
+        launch((groups, block_size), origin,
+            new GPUArray<T>(buf1),
+            new GPUArray<U>(buf2),
+            new GPUArray<V>(buf3),
+            new GPUArray<W>(buf4),
+            new GPUArray<X>(buf5),
+            new GPUArray<Y>(buf6),
+            new GPUArray<Z>(buf7),
+            new GPUArray<A>(buf8),
+            new GPUArray<B>(buf9),
+            new GPUArray<C>(buf10),
+            new GPUArray<D>(buf11),
+            new GPUArray<E>(buf12),
+            new GPUArray<F>(buf13));
+
+        Synchronize();
+    }
+}
+}
\ No newline at end of file

From 91f16921638143f64597a87087fc37a112314bd4 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 17:20:47 -0600
Subject: [PATCH 34/61] ignore generated acceleratorhandler file

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index a24fbd40..df42484a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,7 +9,7 @@ docs/*
 .vscode
 *.opencover.xml
 *.sln
-parfor_dump.cs
+AcceleratorHandler.cs
 ProcessedREADME.md
 
 # User-specific files

From 65408250c534d29704e55c00549f1157e54800b3 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 17:20:54 -0600
Subject: [PATCH 35/61] add T4 stuff

---
 DotMP/DotMP.csproj | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/DotMP/DotMP.csproj b/DotMP/DotMP.csproj
index b618b701..00a47b68 100644
--- a/DotMP/DotMP.csproj
+++ b/DotMP/DotMP.csproj
@@ -23,7 +23,14 @@
 
   <ItemGroup>
     <PackageReference Include="ILGPU" Version="1.5.1" />
-    <PackageReference Include="Mono.Cecil" Version="0.11.5" />
+
+    <None Include="GPU/AcceleratorHandler.cs">
+      <DesignTime>True</DesignTime>
+      <AutoGen>True</AutoGen>
+      <DependentUpon>GPU/AcceleratorHandler.tt</DependentUpon>
+    </None>
+
+    <PackageReference Include="T4.Build" Version="0.2.4" />
   </ItemGroup>
 
 </Project>

From a8872d12342883ab2a1c0724bc42753613c3ac87 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 17:21:04 -0600
Subject: [PATCH 36/61] delete now unnecessary files

---
 DotMP/GPU/AcceleratorHandler.cs  | 758 -------------------------------
 DotMP/GPU/Python/dispatch_gen.py |  93 ----
 DotMP/GPU/Python/parfor_gen.py   |  88 ----
 3 files changed, 939 deletions(-)
 delete mode 100644 DotMP/GPU/AcceleratorHandler.cs
 delete mode 100644 DotMP/GPU/Python/dispatch_gen.py
 delete mode 100644 DotMP/GPU/Python/parfor_gen.py

diff --git a/DotMP/GPU/AcceleratorHandler.cs b/DotMP/GPU/AcceleratorHandler.cs
deleted file mode 100644
index 3608a6c1..00000000
--- a/DotMP/GPU/AcceleratorHandler.cs
+++ /dev/null
@@ -1,758 +0,0 @@
-/*
-* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
-* Copyright (C) 2023 Phillip Allen Lane
-*
-* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
-* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
-* (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
-* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-* License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
-* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using ILGPU;
-using ILGPU.Runtime;
-
-namespace DotMP.GPU
-{
-    /// <summary>
-    /// The handler class managing GPU acceleration.
-    /// </summary>
-    internal class AcceleratorHandler
-    {
-        /// <summary>
-        /// Determines if a GPU context has been initialized yet.
-        /// </summary>
-        private static bool initialized = false;
-        /// <summary>
-        /// The GPU context.
-        /// </summary>
-        private static Context context;
-        /// <summary>
-        /// The accelerator object.
-        /// </summary>
-        internal static Accelerator accelerator;
-        /// <summary>
-        /// Block size to use for kernels.
-        /// </summary>
-        private static int block_size;
-        /// <summary>
-        /// Kernel cache.
-        /// </summary>
-        private static Dictionary<string, dynamic> kernels = new Dictionary<string, dynamic>();
-
-        /// <summary>
-        /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data.
-        /// </summary>
-        internal AcceleratorHandler()
-        {
-            if (initialized) return;
-
-            context = Context.CreateDefault();
-            var selectedDevice = context.Devices[0];
-
-            foreach (var d in context.Devices)
-            {
-                Console.WriteLine("Detected {0} accelerator.", d.ToString());
-
-                if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL)
-                    selectedDevice = d;
-                if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda)
-                    selectedDevice = d;
-            }
-
-            accelerator = selectedDevice.CreateAccelerator(context);
-            //accelerator = context.Devices[0].CreateAccelerator(context);
-
-            Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString());
-            initialized = true;
-            block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256;
-        }
-
-        /// <summary>
-        /// Synchronize pending operations.
-        /// </summary>
-        private void Synchronize() => accelerator.Synchronize();
-
-        /// <summary>
-        /// Get the kernel associated with this lambda.
-        /// </summary>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <param name="action">The action provided on the CPU.</param>
-        /// <param name="src">The calling location.</param>
-        /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, Index, GPUArray<T>> GetKernel<T>(Action<Index, GPUArray<T>> action, string src)
-            where T : unmanaged
-        {
-            if (!kernels.ContainsKey(src))
-                kernels.Add(src, accelerator.LoadStreamKernel(action));
-
-            return (Action<KernelConfig, Index, GPUArray<T>>)kernels[src];
-        }
-
-        /// <summary>
-        /// Get the kernel associated with this lambda.
-        /// </summary>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <param name="action">The action provided on the CPU.</param>
-        /// <param name="src">The calling location.</param>
-        /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>> GetKernel<T, U>(Action<Index, GPUArray<T>, GPUArray<U>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-        {
-            if (!kernels.ContainsKey(src))
-                kernels.Add(src, accelerator.LoadStreamKernel(action));
-
-            return (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>>)kernels[src];
-        }
-
-        /// <summary>
-        /// Get the kernel associated with this lambda.
-        /// </summary>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <param name="action">The action provided on the CPU.</param>
-        /// <param name="src">The calling location.</param>
-        /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> GetKernel<T, U, V>(Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-        {
-            if (!kernels.ContainsKey(src))
-                kernels.Add(src, accelerator.LoadStreamKernel(action));
-
-            return (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>>)kernels[src];
-        }
-
-        /// <summary>
-        /// Get the kernel associated with this lambda.
-        /// </summary>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <param name="action">The action provided on the CPU.</param>
-        /// <param name="src">The calling location.</param>
-        /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> GetKernel<T, U, V, W>(Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-        {
-            if (!kernels.ContainsKey(src))
-                kernels.Add(src, accelerator.LoadStreamKernel(action));
-
-            return (Action<KernelConfig, Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>>)kernels[src];
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with one parameter.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf">The buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action, string src)
-            where T : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = GetKernel(action, src);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with two parameters.
-        /// </summary>
-        /// <param name="ranges">The starts and ends of the loop.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U>((int, int)[] ranges, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-        {
-            int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y);
-            var idx = new Index(ranges);
-
-            var kernel = GetKernel(action, src);
-
-            kernel((len / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with three parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = GetKernel(action, src);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with four parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = GetKernel(action, src);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with five parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with six parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with seven parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6),
-                new GPUArray<Z>(buf7));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with eight parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6),
-                new GPUArray<Z>(buf7),
-                new GPUArray<A>(buf8));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with nine parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6),
-                new GPUArray<Z>(buf7),
-                new GPUArray<A>(buf8),
-                new GPUArray<B>(buf9));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with ten parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6),
-                new GPUArray<Z>(buf7),
-                new GPUArray<A>(buf8),
-                new GPUArray<B>(buf9),
-                new GPUArray<C>(buf10));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with eleven parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6),
-                new GPUArray<Z>(buf7),
-                new GPUArray<A>(buf8),
-                new GPUArray<B>(buf9),
-                new GPUArray<C>(buf10),
-                new GPUArray<D>(buf11));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with twelve parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-            where E : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6),
-                new GPUArray<Z>(buf7),
-                new GPUArray<A>(buf8),
-                new GPUArray<B>(buf9),
-                new GPUArray<C>(buf10),
-                new GPUArray<D>(buf11),
-                new GPUArray<E>(buf12));
-
-            Synchronize();
-        }
-
-        /// <summary>
-        /// Dispatches a kernel with thirteen parameters.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="src">The originating caller location.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-        internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action, string src)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-            where E : unmanaged
-            where F : unmanaged
-        {
-            var idx = new Index(start);
-
-            var kernel = accelerator.LoadStreamKernel(action);
-
-            kernel(((end - start) / block_size, block_size), idx,
-                new GPUArray<T>(buf1),
-                new GPUArray<U>(buf2),
-                new GPUArray<V>(buf3),
-                new GPUArray<W>(buf4),
-                new GPUArray<X>(buf5),
-                new GPUArray<Y>(buf6),
-                new GPUArray<Z>(buf7),
-                new GPUArray<A>(buf8),
-                new GPUArray<B>(buf9),
-                new GPUArray<C>(buf10),
-                new GPUArray<D>(buf11),
-                new GPUArray<E>(buf12),
-                new GPUArray<F>(buf13));
-
-            Synchronize();
-        }
-    }
-}
\ No newline at end of file
diff --git a/DotMP/GPU/Python/dispatch_gen.py b/DotMP/GPU/Python/dispatch_gen.py
deleted file mode 100644
index bb4152cd..00000000
--- a/DotMP/GPU/Python/dispatch_gen.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""
-* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
-* Copyright (C) 2023 Phillip Allen Lane
-*
-* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
-* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
-* (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
-* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-* License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
-* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-"""
-
-ofile = open("./dispatch_dump.cs", "w")
-
-cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight",
-             "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
-ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
-            "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"]
-
-letters = ["T", "U", "V", "W", "X", "Y", "Z",
-           "A", "B", "C", "D", "E", "F", "G", "H", "I"]
-
-for i in range(0, 13):
-    funcstr = ""
-
-    funcstr += """/// <summary>
-/// Dispatches a kernel with {c} parameters.
-/// </summary>
-/// <param name="start">The start of the loop, inclusive.</param>
-/// <param name="end">The end of the loop, exclusive.</param>""".format(c=cardinals[i])
-
-    for j in range(i + 1):
-        adjusted = j + 1
-
-        funcstr += """
-/// <param name="buf{a}">The {o} buffer to run the kernel with.</param>""".format(a=j + 1, o=ordinals[j])
-
-    funcstr += """
-/// <param name="action">The kernel to run on the GPU.</param>"""
-
-    for j in range(i + 1):
-        funcstr += """
-/// <typeparam name="{l}">The base type of the {o} argument. Must be an unmanaged type.</typeparam>""".format(l=letters[j], o=ordinals[j])
-
-    funcstr += """
-internal void DispatchKernel<"""
-
-    for j in range(i):
-        funcstr += "{l}, ".format(l=letters[j])
-
-    funcstr += "{l}>(int start, int end, ".format(l=letters[i])
-
-    for j in range(i + 1):
-        adjusted = j + 1
-        funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=adjusted)
-
-    funcstr += "Action<Index, "
-
-    for j in range(i):
-        adjusted = j + 1
-        funcstr += "GPUArray<{l}>, ".format(l=letters[j])
-
-    funcstr += "GPUArray<{l}>> action)".format(l=letters[i])
-
-    for j in range(i + 1):
-        funcstr += "\n    where {l} : unmanaged".format(l=letters[j])
-
-    funcstr += """
-{
-    var idx = new Index();
-
-    var kernel = accelerator.LoadStreamKernel(action);
-
-    kernel(((end - start) / block_size, block_size), idx,
-"""
-
-    for j in range(i):
-        adjusted = j + 1
-        funcstr += """        new GPUArray<{l}>(buf{a}.View),
-""".format(l=letters[j], a=adjusted)
-
-    funcstr += """        new GPUArray<{l}>(buf{a}.View));
-
-    Synchronize();
-""".format(l=letters[i], a=i + 1)
-
-    funcstr += "}\n\n"
-
-    ofile.write(funcstr)
diff --git a/DotMP/GPU/Python/parfor_gen.py b/DotMP/GPU/Python/parfor_gen.py
deleted file mode 100644
index e960e861..00000000
--- a/DotMP/GPU/Python/parfor_gen.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""
-* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
-* Copyright (C) 2023 Phillip Allen Lane
-*
-* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
-* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
-* (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
-* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-* License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
-* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-"""
-
-ofile = open("./parfor_dump.cs", "w")
-
-cardinals = ["one", "two", "three", "four", "five", "six", "seven", "eight",
-             "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"]
-ordinals = ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth",
-            "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth"]
-
-letters = ["T", "U", "V", "W", "X", "Y", "Z",
-           "A", "B", "C", "D", "E", "F", "G", "H", "I"]
-
-for i in range(0, 13):
-    funcstr = ""
-
-    funcstr += """/// <summary>
-/// Creates a GPU parallel for loop.
-/// The body of the kernel is run on a GPU target.
-/// This overload specifies that {c} arrays are used on the GPU.
-/// </summary>
-/// <param name="start">The start of the loop, inclusive.</param>
-/// <param name="end">The end of the loop, exclusive.</param>""".format(c=cardinals[i])
-
-    for j in range(i + 1):
-        adjusted = j + 1
-
-        funcstr += """
-/// <param name="buf{a}">The {o} buffer to run the kernel with.</param>""".format(a=j + 1, o=ordinals[j])
-
-    funcstr += """
-/// <param name="action">The kernel to run on the GPU.</param>"""
-
-    for j in range(i + 1):
-        funcstr += """
-/// <typeparam name="{l}">The base type of the {o} argument. Must be an unmanaged type.</typeparam>""".format(l=letters[j], o=ordinals[j])
-
-    funcstr += """
-public static void ParallelFor<"""
-
-    for j in range(i):
-        funcstr += "{l}, ".format(l=letters[j])
-
-    funcstr += "{l}>(int start, int end, ".format(l=letters[i])
-
-    for j in range(i + 1):
-        adjusted = j + 1
-        funcstr += "Buffer<{l}> buf{a}, ".format(l=letters[j], a=adjusted)
-
-    funcstr += "Action<Index, "
-
-    for j in range(i):
-        adjusted = j + 1
-        funcstr += "GPUArray<{l}>, ".format(l=letters[j])
-
-    funcstr += "GPUArray<{l}>> action)".format(l=letters[i])
-
-    for j in range(i + 1):
-        funcstr += "\n    where {l} : unmanaged".format(l=letters[j])
-
-    funcstr += """
-{
-    var handler = new AcceleratorHandler();
-    handler.DispatchKernel(start, end, """
-
-    for j in range(i + 1):
-        adjusted = j + 1
-        funcstr += "buf{a}, ".format(a=adjusted)
-
-    funcstr += """action);
-}
-
-"""
-
-    ofile.write(funcstr)

From cdf34c17d0ee252163147d33ae30643018c5717e Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 17:37:34 -0600
Subject: [PATCH 37/61] more autogen

---
 DotMP/GPU/AcceleratorHandler.tt | 607 ++------------------------------
 1 file changed, 22 insertions(+), 585 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 9542d5cf..96cb8336 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -87,13 +87,13 @@ namespace DotMP.GPU
         /// </summary>
         private void Synchronize() => accelerator.Synchronize();
 
+<# for (int c = 1; c <= max; c++) { #>
         /// <summary>
         /// Get the kernel associated with this lambda.
         /// </summary>
         /// <param name="action">The action provided on the CPU.</param>
         /// <param name="src">The calling location.</param>
         /// <returns>The GPU kernel.</returns>
-<# for (int c = 1; c <= max; c++) { #>
         private Action<KernelConfig, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
 > GetKernel<
@@ -112,602 +112,39 @@ namespace DotMP.GPU
     }
 <# } #>
 
+<# for (int c = 1; c <= max; c++) { #>
     /// <summary>
-    /// Dispatches a kernel with one parameter.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf">The buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action, string src)
-        where T : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = GetKernel(action, src);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with two parameters.
-    /// </summary>
-    /// <param name="ranges">The starts and ends of the loop.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U>((int, int)[] ranges, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-    {
-        int len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y);
-        var idx = new Index(ranges);
-
-        var kernel = GetKernel(action, src);
-
-        kernel((len / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with three parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = GetKernel(action, src);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with four parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = GetKernel(action, src);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with five parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with six parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with seven parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-        where Z : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6),
-            new GPUArray<Z>(buf7));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with eight parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-    /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-        where Z : unmanaged
-        where A : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6),
-            new GPUArray<Z>(buf7),
-            new GPUArray<A>(buf8));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with nine parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-    /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-    /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-        where Z : unmanaged
-        where A : unmanaged
-        where B : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6),
-            new GPUArray<Z>(buf7),
-            new GPUArray<A>(buf8),
-            new GPUArray<B>(buf9));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with ten parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-    /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-    /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-    /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-        where Z : unmanaged
-        where A : unmanaged
-        where B : unmanaged
-        where C : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6),
-            new GPUArray<Z>(buf7),
-            new GPUArray<A>(buf8),
-            new GPUArray<B>(buf9),
-            new GPUArray<C>(buf10));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with eleven parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-    /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-    /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-    /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-    /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-        where Z : unmanaged
-        where A : unmanaged
-        where B : unmanaged
-        where C : unmanaged
-        where D : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6),
-            new GPUArray<Z>(buf7),
-            new GPUArray<A>(buf8),
-            new GPUArray<B>(buf9),
-            new GPUArray<C>(buf10),
-            new GPUArray<D>(buf11));
-
-        Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with twelve parameters.
+    /// Dispatches a kernel with the given number of parameters.
     /// </summary>
     /// <param name="start">The start of the loop, inclusive.</param>
     /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-    /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-    /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-    /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-    /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-    /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
+<# for (int i = 0; i < c; i++) { #>
+    /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
+<# } #>
     /// <param name="action">The kernel to run on the GPU.</param>
     /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-        where Z : unmanaged
-        where A : unmanaged
-        where B : unmanaged
-        where C : unmanaged
-        where D : unmanaged
-        where E : unmanaged
+    internal void DispatchKernel<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+    >(int start, int end,
+<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
+    Action<Index,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+    > action, string src)
+<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
     {
         var idx = new Index(start);
 
-        var kernel = accelerator.LoadStreamKernel(action);
+    var kernel = GetKernel(action, src);
 
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6),
-            new GPUArray<Z>(buf7),
-            new GPUArray<A>(buf8),
-            new GPUArray<B>(buf9),
-            new GPUArray<C>(buf10),
-            new GPUArray<D>(buf11),
-            new GPUArray<E>(buf12));
+    kernel(((end - start) / block_size, block_size), idx
+<# for (int i = 0; i < c; i++) { #>
+            , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+<# } #>
+            );
 
         Synchronize();
-    }
-
-    /// <summary>
-    /// Dispatches a kernel with thirteen parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
-    /// <param name="buf1">The first buffer to run the kernel with.</param>
-    /// <param name="buf2">The second buffer to run the kernel with.</param>
-    /// <param name="buf3">The third buffer to run the kernel with.</param>
-    /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-    /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-    /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-    /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-    /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-    /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-    /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-    /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-    /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-    /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-    /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-    internal void DispatchKernel<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action, string src)
-        where T : unmanaged
-        where U : unmanaged
-        where V : unmanaged
-        where W : unmanaged
-        where X : unmanaged
-        where Y : unmanaged
-        where Z : unmanaged
-        where A : unmanaged
-        where B : unmanaged
-        where C : unmanaged
-        where D : unmanaged
-        where E : unmanaged
-        where F : unmanaged
-    {
-        var idx = new Index(start);
-
-        var kernel = accelerator.LoadStreamKernel(action);
-
-        kernel(((end - start) / block_size, block_size), idx,
-            new GPUArray<T>(buf1),
-            new GPUArray<U>(buf2),
-            new GPUArray<V>(buf3),
-            new GPUArray<W>(buf4),
-            new GPUArray<X>(buf5),
-            new GPUArray<Y>(buf6),
-            new GPUArray<Z>(buf7),
-            new GPUArray<A>(buf8),
-            new GPUArray<B>(buf9),
-            new GPUArray<C>(buf10),
-            new GPUArray<D>(buf11),
-            new GPUArray<E>(buf12),
-            new GPUArray<F>(buf13));
+}
+<# } #>
 
-        Synchronize();
-    }
 }
 }
\ No newline at end of file

From 267a789f309d7ad7577472cc61aaf78de50214d8 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 17:37:42 -0600
Subject: [PATCH 38/61] revert collapse

---
 DotMP/GPU/Gpu.cs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
index 1e2cd7a7..5a68462f 100644
--- a/DotMP/GPU/Gpu.cs
+++ b/DotMP/GPU/Gpu.cs
@@ -71,23 +71,23 @@ public static void ParallelFor<T>(int start, int end, Buffer<T> buf, Action<Inde
         /// <param name="path">The path to the file this method was called from.</param>
         /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
         /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /*public static void ParallelFor<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+        public static void ParallelFor<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
         {
             var handler = new AcceleratorHandler();
             string src = FormatCaller(path, line);
             handler.DispatchKernel(start, end, buf1, buf2, action, src);
-        }*/
+        }
 
-        public static void ParallelForCollapse<T, U>((int, int) range1, (int, int) range2, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+        /*public static void ParallelForCollapse<T, U>((int, int) range1, (int, int) range2, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
             where T : unmanaged
             where U : unmanaged
         {
             var handler = new AcceleratorHandler();
             string src = FormatCaller(path, line);
             handler.DispatchKernel(new (int, int)[] { range1, range2 }, buf1, buf2, action, src);
-        }
+        }*/
 
         /// <summary>
         /// Creates a GPU parallel for loop.

From 3d610b43c561cad80884a4fb9a7bedacd39cca4f Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 17:48:17 -0600
Subject: [PATCH 39/61] remove excess newlines

---
 DotMP/GPU/AcceleratorHandler.tt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 96cb8336..3f2af598 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -14,7 +14,6 @@
 * write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
 
-
 <#@ template debug="false" hostspecific="false" language="C#" #>
 <#@ output extension=".cs" #>
 <# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' };
@@ -145,6 +144,5 @@ namespace DotMP.GPU
         Synchronize();
 }
 <# } #>
-
 }
 }
\ No newline at end of file

From d03d31bbf9a6757172fc4a41707d77650c20418f Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 18:04:03 -0600
Subject: [PATCH 40/61] get parallelfor t4 gen working

---
 .gitignore                      |   1 +
 DotMP/GPU/AcceleratorHandler.tt |  46 +--
 DotMP/GPU/Gpu.cs                | 543 --------------------------------
 DotMP/GPU/Gpu.tt                |  76 +++++
 4 files changed, 100 insertions(+), 566 deletions(-)
 delete mode 100644 DotMP/GPU/Gpu.cs
 create mode 100644 DotMP/GPU/Gpu.tt

diff --git a/.gitignore b/.gitignore
index df42484a..ac1e66dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ docs/*
 *.opencover.xml
 *.sln
 AcceleratorHandler.cs
+Gpu.cs
 ProcessedREADME.md
 
 # User-specific files
diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 3f2af598..20fd8fe3 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -95,11 +95,11 @@ namespace DotMP.GPU
         /// <returns>The GPU kernel.</returns>
         private Action<KernelConfig, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
-> GetKernel<
+        > GetKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
->(Action<Index,
+        >(Action<Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
-> action, string src)
+        > action, string src)
 <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
         {
             if (!kernels.ContainsKey(src))
@@ -108,41 +108,41 @@ namespace DotMP.GPU
             return (Action<KernelConfig, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
             >) kernels[src];
-    }
+        }
 <# } #>
 
 <# for (int c = 1; c <= max; c++) { #>
-    /// <summary>
-    /// Dispatches a kernel with the given number of parameters.
-    /// </summary>
-    /// <param name="start">The start of the loop, inclusive.</param>
-    /// <param name="end">The end of the loop, exclusive.</param>
+        /// <summary>
+        /// Dispatches a kernel with the given number of parameters.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
 <# for (int i = 0; i < c; i++) { #>
-    /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
+        /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
 <# } #>
-    /// <param name="action">The kernel to run on the GPU.</param>
-    /// <param name="src">The originating caller location.</param>
-    internal void DispatchKernel<
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        internal void DispatchKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
-    >(int start, int end,
+        >(int start, int end,
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-    Action<Index,
+        Action<Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
-    > action, string src)
+        > action, string src)
 <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
-    {
-        var idx = new Index(start);
+        {
+            var idx = new Index(start);
 
-    var kernel = GetKernel(action, src);
+            var kernel = GetKernel(action, src);
 
-    kernel(((end - start) / block_size, block_size), idx
+            kernel(((end - start) / block_size, block_size), idx
 <# for (int i = 0; i < c; i++) { #>
             , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
 <# } #>
             );
 
-        Synchronize();
-}
+            Synchronize();
+        }
 <# } #>
-}
+    }
 }
\ No newline at end of file
diff --git a/DotMP/GPU/Gpu.cs b/DotMP/GPU/Gpu.cs
deleted file mode 100644
index 5a68462f..00000000
--- a/DotMP/GPU/Gpu.cs
+++ /dev/null
@@ -1,543 +0,0 @@
-/*
-* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
-* Copyright (C) 2023 Phillip Allen Lane
-*
-* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
-* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
-* (at your option) any later version.
-*
-* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
-* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
-* License for more details.
-*
-* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
-* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-*/
-
-using System;
-using System.Runtime.CompilerServices;
-
-namespace DotMP.GPU
-{
-    /// <summary>
-    /// The main class of DotMP's GPU API, powered by the ILGPU project.
-    /// Contains all the main methods for constructing and running GPU kernels.
-    /// The GPU API is not thread-safe at the current moment, so its methods should not be called from within a Parallel.ParallelRegion!
-    /// </summary>
-    public static class Parallel
-    {
-        /// <summary>
-        /// Formats the caller information for determining uniqueness of a call.
-        /// </summary>
-        /// <param name="filename">The calling file.</param>
-        /// <param name="linenum">The calling line number.</param>
-        /// <returns>A formatted string representing "{filename}:{linenum}"</returns>
-        private static string FormatCaller(string filename, int linenum)
-        {
-            return string.Format("{0}:{1}", filename, linenum);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that one array is used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf">The buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T>(int start, int end, Buffer<T> buf, Action<Index, GPUArray<T>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that two arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, action, src);
-        }
-
-        /*public static void ParallelForCollapse<T, U>((int, int) range1, (int, int) range2, Buffer<T> buf1, Buffer<U> buf2, Action<Index, GPUArray<T>, GPUArray<U>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(new (int, int)[] { range1, range2 }, buf1, buf2, action, src);
-        }*/
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that three arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that four arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that five arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that six arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that seven arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that eight arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that nine arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that ten arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that eleven arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that twelve arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-            where E : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, action, src);
-        }
-
-        /// <summary>
-        /// Creates a GPU parallel for loop.
-        /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that thirteen arrays are used on the GPU.
-        /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
-        /// <param name="buf1">The first buffer to run the kernel with.</param>
-        /// <param name="buf2">The second buffer to run the kernel with.</param>
-        /// <param name="buf3">The third buffer to run the kernel with.</param>
-        /// <param name="buf4">The fourth buffer to run the kernel with.</param>
-        /// <param name="buf5">The fifth buffer to run the kernel with.</param>
-        /// <param name="buf6">The sixth buffer to run the kernel with.</param>
-        /// <param name="buf7">The seventh buffer to run the kernel with.</param>
-        /// <param name="buf8">The eighth buffer to run the kernel with.</param>
-        /// <param name="buf9">The ninth buffer to run the kernel with.</param>
-        /// <param name="buf10">The tenth buffer to run the kernel with.</param>
-        /// <param name="buf11">The eleventh buffer to run the kernel with.</param>
-        /// <param name="buf12">The twelfth buffer to run the kernel with.</param>
-        /// <param name="buf13">The thirteenth buffer to run the kernel with.</param>
-        /// <param name="action">The kernel to run on the GPU.</param>
-        /// <param name="line">The line number this method was called from.</param>
-        /// <param name="path">The path to the file this method was called from.</param>
-        /// <typeparam name="T">The base type of the first argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="U">The base type of the second argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="V">The base type of the third argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="W">The base type of the fourth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="X">The base type of the fifth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Y">The base type of the sixth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="Z">The base type of the seventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="A">The base type of the eighth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="B">The base type of the ninth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="C">The base type of the tenth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="D">The base type of the eleventh argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="E">The base type of the twelfth argument. Must be an unmanaged type.</typeparam>
-        /// <typeparam name="F">The base type of the thirteenth argument. Must be an unmanaged type.</typeparam>
-        public static void ParallelFor<T, U, V, W, X, Y, Z, A, B, C, D, E, F>(int start, int end, Buffer<T> buf1, Buffer<U> buf2, Buffer<V> buf3, Buffer<W> buf4, Buffer<X> buf5, Buffer<Y> buf6, Buffer<Z> buf7, Buffer<A> buf8, Buffer<B> buf9, Buffer<C> buf10, Buffer<D> buf11, Buffer<E> buf12, Buffer<F> buf13, Action<Index, GPUArray<T>, GPUArray<U>, GPUArray<V>, GPUArray<W>, GPUArray<X>, GPUArray<Y>, GPUArray<Z>, GPUArray<A>, GPUArray<B>, GPUArray<C>, GPUArray<D>, GPUArray<E>, GPUArray<F>> action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-            where T : unmanaged
-            where U : unmanaged
-            where V : unmanaged
-            where W : unmanaged
-            where X : unmanaged
-            where Y : unmanaged
-            where Z : unmanaged
-            where A : unmanaged
-            where B : unmanaged
-            where C : unmanaged
-            where D : unmanaged
-            where E : unmanaged
-            where F : unmanaged
-        {
-            var handler = new AcceleratorHandler();
-            string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, buf1, buf2, buf3, buf4, buf5, buf6, buf7, buf8, buf9, buf10, buf11, buf12, buf13, action, src);
-        }
-    }
-}
\ No newline at end of file
diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt
new file mode 100644
index 00000000..872b0750
--- /dev/null
+++ b/DotMP/GPU/Gpu.tt
@@ -0,0 +1,76 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+<#@ template debug="false" hostspecific="false" language="C#" #>
+<#@ output extension=".cs" #>
+<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' };
+   int max = 13; #>
+
+using System;
+using System.Runtime.CompilerServices;
+
+namespace DotMP.GPU
+{
+    /// <summary>
+    /// The main class of DotMP's GPU API, powered by the ILGPU project.
+    /// Contains all the main methods for constructing and running GPU kernels.
+    /// The GPU API is not thread-safe at the current moment, so its methods should not be called from within a Parallel.ParallelRegion!
+    /// </summary>
+    public static class Parallel
+    {
+        /// <summary>
+        /// Formats the caller information for determining uniqueness of a call.
+        /// </summary>
+        /// <param name="filename">The calling file.</param>
+        /// <param name="linenum">The calling line number.</param>
+        /// <returns>A formatted string representing "{filename}:{linenum}"</returns>
+        private static string FormatCaller(string filename, int linenum)
+        {
+            return string.Format("{0}:{1}", filename, linenum);
+        }
+
+<# for (int c = 1; c <= max; c++) { #>
+        /// <summary>
+        /// Creates a GPU parallel for loop.
+        /// The body of the kernel is run on a GPU target.
+        /// This overload specifies that one array is used on the GPU.
+        /// </summary>
+        /// <param name="start">The start of the loop, inclusive.</param>
+        /// <param name="end">The end of the loop, exclusive.</param>
+<# for (int i = 0; i < c; i++) { #>
+        /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
+<# } #>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
+        public static void ParallelFor<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+        >(int start, int end, 
+<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
+        Action<Index, 
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+        > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
+        {
+            var handler = new AcceleratorHandler();
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(start, end, 
+<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #>
+            action, src);
+        }
+<# } #>
+    }
+}
\ No newline at end of file

From 945f7e74637afdcb86be95aff2c9556643602374 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sat, 11 Nov 2023 18:15:05 -0600
Subject: [PATCH 41/61] implement collapsed for loops

---
 DotMP/GPU/AcceleratorHandler.tt | 10 +++++-----
 DotMP/GPU/Gpu.tt                | 33 ++++++++++++++++++++++++++++++++-
 DotMP/GPU/Index.cs              | 32 +++++++++++++++++---------------
 3 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 20fd8fe3..88434822 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -115,8 +115,7 @@ namespace DotMP.GPU
         /// <summary>
         /// Dispatches a kernel with the given number of parameters.
         /// </summary>
-        /// <param name="start">The start of the loop, inclusive.</param>
-        /// <param name="end">The end of the loop, exclusive.</param>
+        /// <param name="ranges">The ranges of the for loop.</param>
 <# for (int i = 0; i < c; i++) { #>
         /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
 <# } #>
@@ -124,18 +123,19 @@ namespace DotMP.GPU
         /// <param name="src">The originating caller location.</param>
         internal void DispatchKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
-        >(int start, int end,
+        >((int, int)[] ranges,
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
         Action<Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, string src)
 <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
         {
-            var idx = new Index(start);
+            var len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y);
+            var idx = new Index(ranges);
 
             var kernel = GetKernel(action, src);
 
-            kernel(((end - start) / block_size, block_size), idx
+            kernel((len / block_size, block_size), idx
 <# for (int i = 0; i < c; i++) { #>
             , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
 <# } #>
diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt
index 872b0750..d6962b02 100644
--- a/DotMP/GPU/Gpu.tt
+++ b/DotMP/GPU/Gpu.tt
@@ -67,7 +67,38 @@ namespace DotMP.GPU
         {
             var handler = new AcceleratorHandler();
             string src = FormatCaller(path, line);
-            handler.DispatchKernel(start, end, 
+            handler.DispatchKernel(new (int, int)[] { (start, end) }, 
+<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #>
+            action, src);
+        }
+<# } #>
+
+<# for (int c = 1; c <= max; c++) { #>
+        /// <summary>
+        /// Creates a collapsed GPU parallel for loop.
+        /// The body of the kernel is run on a GPU target.
+        /// This overload specifies that one array is used on the GPU.
+        /// </summary>
+        /// <param name="range1">The range of the outer for loop.</param>
+        /// <param name="range2">The range of the outer for loop.</param>
+<# for (int i = 0; i < c; i++) { #>
+        /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
+<# } #>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
+        public static void ParallelForCollapse<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+        >((int, int) range1, (int, int) range2, 
+<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
+        Action<Index, 
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+        > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
+        {
+            var handler = new AcceleratorHandler();
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(new (int, int)[] { range1, range2 }, 
 <# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #>
             action, src);
         }
diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index 8a0c09f4..83d9f7e6 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -42,23 +42,25 @@ public struct Index
         /// <summary>
         /// Constructor.
         /// </summary>
-        /// <param name="start">The start of the parallel for loop.</param>
-        internal Index(int start)
-        {
-            this.start1 = start;
-            this.start2 = 0;
-            i_prv = -1;
-            j_prv = -1;
-            diff = 0;
-        }
-
+        /// <param name="ranges">The ranges of the for loop.</param>
         internal Index((int, int)[] ranges)
         {
-            start1 = ranges[0].Item1;
-            start2 = ranges[1].Item1;
-            i_prv = -1;
-            j_prv = -1;
-            diff = ranges[1].Item2 - ranges[1].Item1;
+            if (ranges.Length == 1)
+            {
+                start1 = ranges[0].Item1;
+                start2 = -1;
+                i_prv = -1;
+                j_prv = -1;
+                diff = -1;
+            }
+            else
+            {
+                start1 = ranges[0].Item1;
+                start2 = ranges[1].Item1;
+                i_prv = -1;
+                j_prv = -1;
+                diff = ranges[1].Item2 - ranges[1].Item1;
+            }
         }
 
         /// <summary>

From d90279e3923fbc7fddfc7681b93108cd4ca5d8da Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sun, 12 Nov 2023 13:03:10 -0600
Subject: [PATCH 42/61] remove erroneous comment line

---
 DotMP/GPU/Gpu.tt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt
index d6962b02..4ac1f49e 100644
--- a/DotMP/GPU/Gpu.tt
+++ b/DotMP/GPU/Gpu.tt
@@ -46,7 +46,6 @@ namespace DotMP.GPU
         /// <summary>
         /// Creates a GPU parallel for loop.
         /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that one array is used on the GPU.
         /// </summary>
         /// <param name="start">The start of the loop, inclusive.</param>
         /// <param name="end">The end of the loop, exclusive.</param>
@@ -77,7 +76,6 @@ namespace DotMP.GPU
         /// <summary>
         /// Creates a collapsed GPU parallel for loop.
         /// The body of the kernel is run on a GPU target.
-        /// This overload specifies that one array is used on the GPU.
         /// </summary>
         /// <param name="range1">The range of the outer for loop.</param>
         /// <param name="range2">The range of the outer for loop.</param>

From f22ac639a6300e4db8a8790030992ef7efa9ae46 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sun, 12 Nov 2023 13:03:26 -0600
Subject: [PATCH 43/61] test with 500x500 instead of 514x514

---
 benchmarks/GPUHeatTransfer/Program.cs | 38 ++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
index 58ca52e0..1800e673 100644
--- a/benchmarks/GPUHeatTransfer/Program.cs
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -37,7 +37,7 @@ public class HeatTransfer
     public enum ParType { DMPFor, DMPGPU }
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    [Params(514)]
+    [Params(500)]
     public int dim;
 
     // test with 10 steps and 100 steps
@@ -64,8 +64,13 @@ public void Setup()
         scratch = new double[dim, dim];
         grid = new double[dim, dim];
 
-        grid[0, dim / 2 - 1] = 100.0;
-        grid[0, dim / 2] = 100.0;
+        for (int i = 0; i < dim; i++)
+        {
+            grid[0, i] = 100.0;
+            grid[i, 0] = 100.0;
+            grid[dim - 1, i] = 100.0;
+            grid[i, dim - 1] = 100.0;
+        }
 
         if (type == ParType.DMPGPU)
         {
@@ -129,13 +134,17 @@ public void DoStep()
             case ParType.DMPGPU:
                 DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
+                    int i = idx.i;
+                    int j = idx.j;
                     //set the scratch array to the average of the surrounding cells
-                    scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]);
+                    scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
                 });
 
                 DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
-                    grid[idx.i, idx.j] = scratch[idx.i, idx.j];
+                    int i = idx.i;
+                    int j = idx.j;
+                    grid[i, j] = scratch[i, j];
                 });
                 break;
         }
@@ -154,7 +163,7 @@ public class HeatTransferVerify
     public enum ParType { DMPFor, DMPGPU }
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    public int dim = 514;
+    public int dim = 500;
 
     // test with 10 steps and 100 steps
     public int steps = 100;
@@ -177,8 +186,13 @@ public void Setup()
         scratch = new double[dim, dim];
         grid = new double[dim, dim];
 
-        grid[0, dim / 2 - 1] = 100.0;
-        grid[0, dim / 2] = 100.0;
+        for (int i = 0; i < dim; i++)
+        {
+            grid[0, i] = 100.0;
+            grid[i, 0] = 100.0;
+            grid[dim - 1, i] = 100.0;
+            grid[i, dim - 1] = 100.0;
+        }
 
         if (type == ParType.DMPGPU)
         {
@@ -241,13 +255,17 @@ public void DoStep()
             case ParType.DMPGPU:
                 DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
+                    int i = idx.i;
+                    int j = idx.j;
                     //set the scratch array to the average of the surrounding cells
-                    scratch[idx.i, idx.j] = 0.25 * (grid[idx.i - 1, idx.j] + grid[idx.i + 1, idx.j] + grid[idx.i, idx.j - 1] + grid[idx.i, idx.j + 1]);
+                    scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
                 });
 
                 DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
                 {
-                    grid[idx.i, idx.j] = scratch[idx.i, idx.j];
+                    int i = idx.i;
+                    int j = idx.j;
+                    grid[i, j] = scratch[i, j];
                 });
                 break;
         }

From aeba3b837f36790a080ea21c49d93edf51091b76 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sun, 12 Nov 2023 13:03:52 -0600
Subject: [PATCH 44/61] properly handle loops not divisible by block size

---
 DotMP/GPU/AcceleratorHandler.tt | 13 +++++++++++++
 DotMP/GPU/Index.cs              | 19 +++++++++++++------
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 88434822..260354f7 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -141,6 +141,19 @@ namespace DotMP.GPU
 <# } #>
             );
 
+            int not_done = len % block_size;
+
+            if (not_done > 0)
+            {
+                idx = new Index(ranges, len - (not_done));
+
+                kernel((1, not_done), idx
+<# for (int i = 0; i < c; i++) { #>
+                , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+<# } #>
+                );
+            }
+
             Synchronize();
         }
 <# } #>
diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index 83d9f7e6..be52d892 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -15,6 +15,7 @@
 */
 
 using ILGPU;
+using ILGPU.Runtime.Cuda;
 using System;
 using System.Diagnostics.CodeAnalysis;
 using System.Linq;
@@ -39,12 +40,16 @@ public struct Index
 
         private int diff;
 
+        private int offset;
+
         /// <summary>
         /// Constructor.
         /// </summary>
         /// <param name="ranges">The ranges of the for loop.</param>
-        internal Index((int, int)[] ranges)
+        internal Index((int, int)[] ranges, int offset = 0)
         {
+            this.offset = offset;
+
             if (ranges.Length == 1)
             {
                 start1 = ranges[0].Item1;
@@ -70,7 +75,7 @@ internal Index((int, int)[] ranges)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static implicit operator int(Index h)
         {
-            return Grid.GlobalLinearIndex + h.start1;
+            return Grid.GlobalLinearIndex + h.start1 + h.offset;
         }
 
         public int i
@@ -79,8 +84,9 @@ public int i
             {
                 if (i_prv == -1)
                 {
-                    i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff);
-                    j_prv = Grid.GlobalLinearIndex - i_prv * diff;
+                    int idxoffset = Grid.GlobalLinearIndex + offset;
+                    i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff);
+                    j_prv = idxoffset - i_prv * diff;
                     i_prv += start1;
                     j_prv += start2;
                 }
@@ -95,8 +101,9 @@ public int j
             {
                 if (j_prv == -1)
                 {
-                    i_prv = IntrinsicMath.DivRoundDown(Grid.GlobalLinearIndex, diff);
-                    j_prv = Grid.GlobalLinearIndex - i_prv * diff;
+                    int idxoffset = Grid.GlobalLinearIndex + offset;
+                    i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff);
+                    j_prv = idxoffset - i_prv * diff;
                     i_prv += start1;
                     j_prv += start2;
                 }

From a93d9fafc93e48f45c275e6b53c3e39ef69b0a13 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sun, 12 Nov 2023 13:11:44 -0600
Subject: [PATCH 45/61] turn array bounds into off-256-divisible size for better
 testing

---
 DotMP-Tests/GPUTests.cs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs
index c155fd2d..244f063c 100644
--- a/DotMP-Tests/GPUTests.cs
+++ b/DotMP-Tests/GPUTests.cs
@@ -23,11 +23,11 @@ public class GPUTests
         [Fact]
         public void GPU_for_works()
         {
-            double[] a = new double[65536];
-            double[] x = new double[65536];
-            double[] y = new double[65536];
-            float[] res = new float[65536];
-            float[] res_cpu = new float[65536];
+            double[] a = new double[50000];
+            double[] x = new double[50000];
+            double[] y = new double[50000];
+            float[] res = new float[50000];
+            float[] res_cpu = new float[50000];
 
             random_init(a);
             random_init(x);
@@ -81,4 +81,4 @@ private void random_init<T>(T[] arr)
             }
         }
     }
-}
\ No newline at end of file
+}

From 11d4baf3a41b97aa568b92cd9bfcbaa009f74d30 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sun, 12 Nov 2023 18:20:47 -0600
Subject: [PATCH 46/61] enable more optimizations

---
 DotMP/GPU/AcceleratorHandler.tt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 260354f7..31b71070 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -60,7 +60,12 @@ namespace DotMP.GPU
         {
             if (initialized) return;
 
-            context = Context.CreateDefault();
+            context = Context.Create()
+                .Optimize(OptimizationLevel.O2)
+                .Inlining(InliningMode.Aggressive)
+                .AllAccelerators()
+                //.Math(MathMode.Fast32BitOnly)
+                .ToContext();
             var selectedDevice = context.Devices[0];
 
             foreach (var d in context.Devices)
@@ -158,4 +163,4 @@ namespace DotMP.GPU
         }
 <# } #>
     }
-}
\ No newline at end of file
+}

From 1c12e3911144a612afc495bf10e7402ba90d8698 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Sun, 12 Nov 2023 18:20:56 -0600
Subject: [PATCH 47/61] add GPU kernel launch overhead benchmark

---
 benchmarks/GPUOverhead/GPUOverhead.csproj | 18 ++++++++
 benchmarks/GPUOverhead/Program.cs         | 56 +++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 benchmarks/GPUOverhead/GPUOverhead.csproj
 create mode 100644 benchmarks/GPUOverhead/Program.cs

diff --git a/benchmarks/GPUOverhead/GPUOverhead.csproj b/benchmarks/GPUOverhead/GPUOverhead.csproj
new file mode 100644
index 00000000..9cf0a6f0
--- /dev/null
+++ b/benchmarks/GPUOverhead/GPUOverhead.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="BenchmarkDotNet" Version="0.13.10" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\DotMP\DotMP.csproj" />
+  </ItemGroup>
+
+</Project>
diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs
new file mode 100644
index 00000000..b9ff18a5
--- /dev/null
+++ b/benchmarks/GPUOverhead/Program.cs
@@ -0,0 +1,56 @@
+﻿/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Diagnosers;
+
+/* jscpd:ignore-start */
+
+[SimpleJob(RuntimeMoniker.Net60)]
+[ThreadingDiagnoser]
+[HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
+public class Overhead
+{
+    DotMP.GPU.Buffer<byte> buf;
+
+    // run the setup
+    [GlobalSetup]
+    public void Setup()
+    {
+	buf = new DotMP.GPU.Buffer<byte>(new byte[1], DotMP.GPU.Buffer.Behavior.NoCopy);
+    }
+
+    // measure the overhead of launching an empty, single-iteration GPU kernel
+    [Benchmark]
+    public void TestOverhead()
+    {
+	DotMP.GPU.Parallel.ParallelFor(0, 1, buf, (i, buf) => { });
+    }
+}
+
+/* jscpd:ignore-end */
+
+// driver
+public class Program
+{
+    public static void Main(string[] args)
+    {
+        BenchmarkRunner.Run<Overhead>();
+    }
+}

From f2b2360f771411c0c6fadc902510295b8441dfe1 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 07:39:35 -0600
Subject: [PATCH 48/61] begin progress towards better index integration

---
 DotMP/GPU/AcceleratorHandler.tt       | 191 +++++++++++++-
 DotMP/GPU/Gpu.tt                      |  49 +++-
 DotMP/GPU/Index.cs                    | 345 ++++++++++++++++++++++----
 benchmarks/GPUHeatTransfer/Program.cs |  16 +-
 4 files changed, 516 insertions(+), 85 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 31b71070..0fbe6ee5 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -98,19 +98,75 @@ namespace DotMP.GPU
         /// <param name="action">The action provided on the CPU.</param>
         /// <param name="src">The calling location.</param>
         /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, Index,
+        private Action<KernelConfig, IndexI,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > GetKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
-        >(Action<Index,
+        >(Action<IndexI,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
         > action, string src)
-<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
+        {
+            if (!kernels.ContainsKey(src))
+                kernels.Add(src, accelerator.LoadStreamKernel(action));
+
+            return (Action<KernelConfig, IndexI,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
+            >) kernels[src];
+        }
+<# } #>
+
+<# for (int c = 1; c <= max - 1; c++) { #>
+        /// <summary>
+        /// Get the kernel associated with this lambda.
+        /// </summary>
+        /// <param name="action">The action provided on the CPU.</param>
+        /// <param name="src">The calling location.</param>
+        /// <returns>The GPU kernel.</returns>
+        private Action<KernelConfig, IndexI, IndexJ,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+        > GetKernel<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+        >(Action<IndexI, IndexJ,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
+        > action, string src)
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
+        {
+            if (!kernels.ContainsKey(src))
+                kernels.Add(src, accelerator.LoadStreamKernel(action));
+
+            return (Action<KernelConfig, IndexI, IndexJ,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
+            >) kernels[src];
+        }
+<# } #>
+
+<# for (int c = 1; c <= max - 2; c++) { #>
+        /// <summary>
+        /// Get the kernel associated with this lambda.
+        /// </summary>
+        /// <param name="action">The action provided on the CPU.</param>
+        /// <param name="src">The calling location.</param>
+        /// <returns>The GPU kernel.</returns>
+        private Action<KernelConfig, IndexI, IndexJ, IndexK,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+        > GetKernel<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+        >(Action<IndexI, IndexJ, IndexK,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
+        > action, string src)
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
         {
             if (!kernels.ContainsKey(src))
                 kernels.Add(src, accelerator.LoadStreamKernel(action));
 
-            return (Action<KernelConfig, Index,
+            return (Action<KernelConfig, IndexI, IndexJ, IndexK,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
             >) kernels[src];
         }
@@ -118,9 +174,9 @@ namespace DotMP.GPU
 
 <# for (int c = 1; c <= max; c++) { #>
         /// <summary>
-        /// Dispatches a kernel with the given number of parameters.
+        /// Dispatches a linear kernel with the given number of parameters.
         /// </summary>
-        /// <param name="ranges">The ranges of the for loop.</param>
+        /// <param name="range1">The range of the for loop.</param>
 <# for (int i = 0; i < c; i++) { #>
         /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
 <# } #>
@@ -128,15 +184,17 @@ namespace DotMP.GPU
         /// <param name="src">The originating caller location.</param>
         internal void DispatchKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
-        >((int, int)[] ranges,
+        >((int, int) range1,
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<Index,
+        Action<IndexI,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, string src)
-<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
         {
-            var len = ranges.Select(tup => tup.Item2 - tup.Item1).Aggregate((x, y) => x * y);
-            var idx = new Index(ranges);
+            var len = range1.Item2 - range1.Item1;
+            var idx = new IndexI(range1);
 
             var kernel = GetKernel(action, src);
 
@@ -150,7 +208,7 @@ namespace DotMP.GPU
 
             if (not_done > 0)
             {
-                idx = new Index(ranges, len - (not_done));
+                idx = new IndexI(range1, len - (not_done));
 
                 kernel((1, not_done), idx
 <# for (int i = 0; i < c; i++) { #>
@@ -162,5 +220,114 @@ namespace DotMP.GPU
             Synchronize();
         }
 <# } #>
+
+<# for (int c = 1; c <= max - 1; c++) { #>
+        /// <summary>
+        /// Dispatches a 2D kernel with the given number of parameters.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The inner range of the for loop.</param>
+<# for (int i = 0; i < c; i++) { #>
+        /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
+<# } #>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        internal void DispatchKernel<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+        >((int, int) range1, (int, int) range2,
+<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
+        Action<IndexI, IndexJ,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+        > action, string src)
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
+        {
+            var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1);
+            var i = new IndexI(range1, range2);
+            var j = new IndexJ(range1, range2);
+
+            var kernel = GetKernel(action, src);
+
+            kernel((len / block_size, block_size), i, j
+<# for (int i = 0; i < c; i++) { #>
+            , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+<# } #>
+            );
+
+            int not_done = len % block_size;
+
+            if (not_done > 0)
+            {
+                int offset = len - not_done;
+                i = new IndexI(range1, range2, offset);
+                j = new IndexJ(range1, range2, offset);
+
+                kernel((1, not_done), i, j
+<# for (int i = 0; i < c; i++) { #>
+                , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+<# } #>
+                );
+            }
+
+            Synchronize();
+        }
+<# } #>
+
+<# for (int c = 1; c <= max - 2; c++) { #>
+        /// <summary>
+        /// Dispatches a 3D kernel with the given number of parameters.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The middle range of the for loop.</param>
+        /// <param name="range3">The inner range of the for loop.</param>
+<# for (int i = 0; i < c; i++) { #>
+        /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
+<# } #>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="src">The originating caller location.</param>
+        internal void DispatchKernel<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+        >((int, int) range1, (int, int) range2, (int, int) range3,
+<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
+        Action<IndexI, IndexJ, IndexK,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+        > action, string src)
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
+        {
+            var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1);
+            var i = new IndexI(range1, range2, range3);
+            var j = new IndexJ(range1, range2, range3);
+            var k = new IndexK(range1, range2, range3);
+
+            var kernel = GetKernel(action, src);
+
+            kernel((len / block_size, block_size), i, j, k
+<# for (int i = 0; i < c; i++) { #>
+            , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+<# } #>
+            );
+
+            int not_done = len % block_size;
+
+            if (not_done > 0)
+            {
+                int offset = len - not_done;
+                i = new IndexI(range1, range2, range3, offset);
+                j = new IndexJ(range1, range2, range3, offset);
+                k = new IndexK(range1, range2, range3, offset);
+
+                kernel((1, not_done), i, j, k
+<# for (int i = 0; i < c; i++) { #>
+                , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+<# } #>
+                );
+            }
+
+            Synchronize();
+        }
+<# } #>
     }
 }
diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt
index 4ac1f49e..060c3d8a 100644
--- a/DotMP/GPU/Gpu.tt
+++ b/DotMP/GPU/Gpu.tt
@@ -59,26 +59,26 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >(int start, int end, 
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<Index, 
+        Action<IndexI, 
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
 <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
         {
             var handler = new AcceleratorHandler();
             string src = FormatCaller(path, line);
-            handler.DispatchKernel(new (int, int)[] { (start, end) }, 
+            handler.DispatchKernel((start, end), 
 <# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #>
             action, src);
         }
 <# } #>
 
-<# for (int c = 1; c <= max; c++) { #>
+<# for (int c = 1; c <= max - 1; c++) { #>
         /// <summary>
         /// Creates a collapsed GPU parallel for loop.
         /// The body of the kernel is run on a GPU target.
         /// </summary>
         /// <param name="range1">The range of the outer for loop.</param>
-        /// <param name="range2">The range of the outer for loop.</param>
+        /// <param name="range2">The range of the inner for loop.</param>
 <# for (int i = 0; i < c; i++) { #>
         /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
 <# } #>
@@ -89,14 +89,49 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >((int, int) range1, (int, int) range2, 
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<Index, 
+        Action<IndexI, IndexJ,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
-<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
+        {
+            var handler = new AcceleratorHandler();
+            string src = FormatCaller(path, line);
+            handler.DispatchKernel(range1, range2, 
+<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #>
+            action, src);
+        }
+<# } #>
+
+<# for (int c = 1; c <= max - 2; c++) { #>
+        /// <summary>
+        /// Creates a collapsed GPU parallel for loop.
+        /// The body of the kernel is run on a GPU target.
+        /// </summary>
+        /// <param name="range1">The range of the outer for loop.</param>
+        /// <param name="range2">The range of the middle for loop.</param>
+        /// <param name="range3">The range of the inner for loop.</param>
+<# for (int i = 0; i < c; i++) { #>
+        /// <param name="buf<#= i + 1 #>">Buffer #<#= i + 1 #> to run the kernel with.</param>
+<# } #>
+        /// <param name="action">The kernel to run on the GPU.</param>
+        /// <param name="line">The line number this method was called from.</param>
+        /// <param name="path">The path to the file this method was called from.</param>
+        public static void ParallelForCollapse<
+<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
+        >((int, int) range1, (int, int) range2, (int, int) range3,
+<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
+        Action<IndexI, IndexJ, IndexK,
+<# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
+        > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
+<# for (int i = 0; i < c; i++) { #>
+            where <#= letters[i] #> : unmanaged
+<# } #>
         {
             var handler = new AcceleratorHandler();
             string src = FormatCaller(path, line);
-            handler.DispatchKernel(new (int, int)[] { range1, range2 }, 
+            handler.DispatchKernel(range1, range2, range3, 
 <# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #>
             action, src);
         }
diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index be52d892..f80cf57d 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -15,101 +15,338 @@
 */
 
 using ILGPU;
-using ILGPU.Runtime.Cuda;
-using System;
 using System.Diagnostics.CodeAnalysis;
-using System.Linq;
 using System.Runtime.CompilerServices;
 
 namespace DotMP.GPU
 {
     /// <summary>
-    /// Handle for a GPU kernel to retrieve its kernel variables.
+    /// Represents an index passed as the first index argument.
     /// </summary>
     [ExcludeFromCodeCoverage]
-    public struct Index
+    public struct IndexI
     {
         /// <summary>
-        /// The start of the for loop, for index calculations.
+        /// The start of the first for loop, for index calculations.
         /// </summary>
         private int start1;
+
+        /// <summary>
+        /// The start of the second for loop, for index calculations.
+        /// </summary>
         private int start2;
 
-        private int i_prv;
-        private int j_prv;
+        /// <summary>
+        /// The start of the third for loop, for index calculations.
+        /// </summary>
+        private int start3;
+
+        /// <summary>
+        /// The index to return.
+        /// </summary>
+        private int idx_prv;
+
+        /// <summary>
+        /// The difference between the second set of ranges.
+        /// </summary>
+        private int diff2;
 
-        private int diff;
+        /// <summary>
+        /// The difference between the third set of ranges.
+        /// </summary>
+        private int diff3;
 
+        /// <summary>
+        /// The offset, in case of a followup kernel.
+        /// </summary>
         private int offset;
 
+        /// <summary>
+        /// The number of dimensions.
+        /// </summary>
+        private int dims;
+
         /// <summary>
         /// Constructor.
         /// </summary>
-        /// <param name="ranges">The ranges of the for loop.</param>
-        internal Index((int, int)[] ranges, int offset = 0)
+        /// <param name="range">The range of the for loop.</param>
+        /// <param name="offset">The offset for followup kernels.</param>
+        internal IndexI((int, int) range, int offset = 0)
         {
             this.offset = offset;
 
-            if (ranges.Length == 1)
-            {
-                start1 = ranges[0].Item1;
-                start2 = -1;
-                i_prv = -1;
-                j_prv = -1;
-                diff = -1;
-            }
-            else
+            start1 = range.Item1;
+            start2 = -1;
+            start3 = -1;
+            idx_prv = -1;
+            diff2 = -1;
+            diff3 = -1;
+            dims = 1;
+        }
+
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The inner range of the for loop.</param>
+        /// <param name="offset">The offset for followup kernels.</param>
+        internal IndexI((int, int) range1, (int, int) range2, int offset = 0)
+        {
+            this.offset = offset;
+
+            start1 = range1.Item1;
+            start2 = range2.Item1;
+            start3 = -1;
+            idx_prv = -1;
+            diff2 = range2.Item2 - range2.Item1;
+            diff3 = -1;
+            dims = 2;
+        }
+
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The middle range of the for loop.</param>
+        /// <param name="range3">The inner range of the for loop.</param>
+        /// <param name="offset">The offset for followup kernels.</param>
+        internal IndexI((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0)
+        {
+            this.offset = offset;
+
+            start1 = range1.Item1;
+            start2 = range2.Item1;
+            start3 = range3.Item1;
+            idx_prv = -1;
+            diff2 = range2.Item2 - range2.Item1;
+            diff3 = range3.Item2 - range3.Item1;
+            dims = 3;
+        }
+
+        /// <summary>
+        /// Converts to the first (outermost) loop index for the current GPU thread, for 1D, 2D or 3D loops.
+        /// </summary>
+        /// <param name="h">The index to cast; a by-value struct copy, so the idx_prv write below stays local to this call.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static implicit operator int(IndexI h)
+        {
+            switch (h.dims)
             {
-                start1 = ranges[0].Item1;
-                start2 = ranges[1].Item1;
-                i_prv = -1;
-                j_prv = -1;
-                diff = ranges[1].Item2 - ranges[1].Item1;
+                default:
+                case 1:
+                    if (h.idx_prv == -1)
+                        h.idx_prv = Grid.GlobalLinearIndex + h.start1 + h.offset;
+
+                    return h.idx_prv;
+
+                case 2:
+                    if (h.idx_prv == -1)
+                    {
+                        int idxoffset = Grid.GlobalLinearIndex + h.offset;
+                        h.idx_prv = IntrinsicMath.DivRoundDown(idxoffset, h.diff2) + h.start1;
+                    }
+
+                    return h.idx_prv;
+
+                case 3:
+                    // Row-major collapse: i advances once per diff2 * diff3 linear iterations.
+                    if (h.idx_prv == -1)
+                        h.idx_prv = IntrinsicMath.DivRoundDown(
+                            Grid.GlobalLinearIndex + h.offset, h.diff2 * h.diff3) + h.start1;
+
+                    return h.idx_prv;
             }
         }
+    }
 
+    /// <summary>
+    /// Represents an index passed as the second index argument.
+    /// </summary>
+    [ExcludeFromCodeCoverage]
+    public struct IndexJ
+    {
         /// <summary>
-        /// Gets the index of the loop.
+        /// The start of the first for loop, for index calculations.
         /// </summary>
-        /// <param name="h">Unused.</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static implicit operator int(Index h)
+        private int start1;
+
+        /// <summary>
+        /// The start of the second for loop, for index calculations.
+        /// </summary>
+        private int start2;
+
+        /// <summary>
+        /// The start of the third for loop, for index calculations.
+        /// </summary>
+        private int start3;
+
+        /// <summary>
+        /// The index to return.
+        /// </summary>
+        private int idx_prv;
+
+        /// <summary>
+        /// The difference between the second set of ranges.
+        /// </summary>
+        private int diff2;
+
+        /// <summary>
+        /// The difference between the third set of ranges.
+        /// </summary>
+        private int diff3;
+
+        /// <summary>
+        /// The offset, in case of a followup kernel.
+        /// </summary>
+        private int offset;
+
+        /// <summary>
+        /// The number of dimensions.
+        /// </summary>
+        private int dims;
+
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The inner range of the for loop.</param>
+        /// <param name="offset">The offset for followup kernels.</param>
+        internal IndexJ((int, int) range1, (int, int) range2, int offset = 0)
         {
-            return Grid.GlobalLinearIndex + h.start1 + h.offset;
+            this.offset = offset;
+
+            start1 = range1.Item1;
+            start2 = range2.Item1;
+            start3 = -1;
+            idx_prv = -1;
+            diff2 = range2.Item2 - range2.Item1;
+            diff3 = -1;
+            dims = 2;
         }
 
-        public int i
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The middle range of the for loop.</param>
+        /// <param name="range3">The inner range of the for loop.</param>
+        /// <param name="offset">The offset for followup kernels.</param>
+        internal IndexJ((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0)
         {
-            get
+            this.offset = offset;
+
+            start1 = range1.Item1;
+            start2 = range2.Item1;
+            start3 = range3.Item1;
+            idx_prv = -1;
+            diff2 = range2.Item2 - range2.Item1;
+            diff3 = range3.Item2 - range3.Item1;
+            dims = 3;
+        }
+
+        /// <summary>
+        /// Converts to the second loop index for the current GPU thread (inner in 2D, middle in 3D).
+        /// </summary>
+        /// <param name="h">The index to cast; a by-value struct copy, so the idx_prv write below stays local to this call.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static implicit operator int(IndexJ h)
+        {
+            switch (h.dims)
             {
-                if (i_prv == -1)
-                {
-                    int idxoffset = Grid.GlobalLinearIndex + offset;
-                    i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff);
-                    j_prv = idxoffset - i_prv * diff;
-                    i_prv += start1;
-                    j_prv += start2;
-                }
-
-                return i_prv;
+                default:
+                case 2:
+                    if (h.idx_prv == -1)
+                    {
+                        int idxoffset = Grid.GlobalLinearIndex + h.offset;
+                        h.idx_prv = (idxoffset % h.diff2) + h.start2;
+                    }
+
+                    return h.idx_prv;
+
+                case 3:
+                    // Row-major collapse: j advances once per diff3 linear iterations and wraps at diff2.
+                    if (h.idx_prv == -1)
+                        h.idx_prv = (IntrinsicMath.DivRoundDown(
+                            Grid.GlobalLinearIndex + h.offset, h.diff3) % h.diff2) + h.start2;
+
+                    return h.idx_prv;
             }
         }
+    }
+
+    /// <summary>
+    /// Represents an index passed as the third index argument.
+    /// </summary>
+    [ExcludeFromCodeCoverage]
+    public struct IndexK
+    {
+        /// <summary>
+        /// The start of the first for loop, for index calculations.
+        /// </summary>
+        private int start1;
+
+        /// <summary>
+        /// The start of the second for loop, for index calculations.
+        /// </summary>
+        private int start2;
+
+        /// <summary>
+        /// The start of the third for loop, for index calculations.
+        /// </summary>
+        private int start3;
+
+        /// <summary>
+        /// The index to return.
+        /// </summary>
+        private int idx_prv;
+
+        /// <summary>
+        /// The difference between the second set of ranges.
+        /// </summary>
+        private int diff2;
+
+        /// <summary>
+        /// The difference between the third set of ranges.
+        /// </summary>
+        private int diff3;
+
+        /// <summary>
+        /// The offset, in case of a followup kernel.
+        /// </summary>
+        private int offset;
+
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The middle range of the for loop.</param>
+        /// <param name="range3">The inner range of the for loop.</param>
+        /// <param name="offset">The offset for followup kernels.</param>
+        internal IndexK((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0)
+        {
+            this.offset = offset;
+
+            start1 = range1.Item1;
+            start2 = range2.Item1;
+            start3 = range3.Item1;
+            idx_prv = -1;
+            diff2 = range2.Item2 - range2.Item1;
+            diff3 = range3.Item2 - range3.Item1;
+        }
 
-        public int j
+        /// <summary>
+        /// Converts to the third (innermost) loop index for the current GPU thread: linear index modulo diff3, plus start3.
+        /// </summary>
+        /// <param name="h">The index to cast; a by-value struct copy, so the idx_prv write below stays local to this call.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static implicit operator int(IndexK h)
         {
-            get
+            if (h.idx_prv == -1)
             {
-                if (j_prv == -1)
-                {
-                    int idxoffset = Grid.GlobalLinearIndex + offset;
-                    i_prv = IntrinsicMath.DivRoundDown(idxoffset, diff);
-                    j_prv = idxoffset - i_prv * diff;
-                    i_prv += start1;
-                    j_prv += start2;
-                }
-
-                return j_prv;
+                h.idx_prv = ((Grid.GlobalLinearIndex + h.offset) % h.diff3) + h.start3;
             }
+
+            return h.idx_prv;
         }
     }
 }
\ No newline at end of file
diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
index 1800e673..0d332da2 100644
--- a/benchmarks/GPUHeatTransfer/Program.cs
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -132,18 +132,14 @@ public void DoStep()
                 break;
 
             case ParType.DMPGPU:
-                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) =>
                 {
-                    int i = idx.i;
-                    int j = idx.j;
                     //set the scratch array to the average of the surrounding cells
                     scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
                 });
 
-                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) =>
                 {
-                    int i = idx.i;
-                    int j = idx.j;
                     grid[i, j] = scratch[i, j];
                 });
                 break;
@@ -253,18 +249,14 @@ public void DoStep()
                 break;
 
             case ParType.DMPGPU:
-                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) =>
                 {
-                    int i = idx.i;
-                    int j = idx.j;
                     //set the scratch array to the average of the surrounding cells
                     scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]);
                 });
 
-                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (idx, grid, scratch) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) =>
                 {
-                    int i = idx.i;
-                    int j = idx.j;
                     grid[i, j] = scratch[i, j];
                 });
                 break;

From 1f201884a0b2f5d41ccad7d5138e63d203293c4f Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 07:53:33 -0600
Subject: [PATCH 49/61] add test for forcollapse

---
 DotMP-Tests/GPUTests.cs | 44 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs
index 244f063c..f2b7d9b8 100644
--- a/DotMP-Tests/GPUTests.cs
+++ b/DotMP-Tests/GPUTests.cs
@@ -5,6 +5,7 @@
 using System.Text.Json.Serialization;
 using System.Threading;
 using DotMP;
+using DotMP.GPU;
 using FluentAssertions;
 using Xunit;
 using Xunit.Abstractions;
@@ -71,6 +72,49 @@ public void GPU_for_works()
             Assert.Equal(a, a_old);
         }
 
+        /// <summary>
+        /// Tests to make sure that DotMP.GPU.Parallel.ParallelForCollapse produces correct results.
+        /// </summary>
+        [Fact]
+        public void Collapse_works()
+        {
+            int[,] iters_hit = new int[1024, 1024];
+
+            using (var buf = new Buffer<int>(iters_hit, DotMP.GPU.Buffer.Behavior.ToFrom))
+            {
+                DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, action: (i, j, iters_hit) =>
+                {
+                    iters_hit[i, j]++;
+                });
+            }
+
+            for (int i = 0; i < 1024; i++)
+                for (int j = 0; j < 1024; j++)
+                    if (i >= 258 && i < 512 && j >= 512 && j < 600)
+                        iters_hit[i, j].Should().Be(1);
+                    else
+                        iters_hit[i, j].Should().Be(0);
+
+            /*iters_hit = null;
+
+            int[,,] iters_hit_3 = new int[128, 128, 64];
+
+            DotMP.Parallel.ParallelForCollapse((35, 64), (16, 100), (10, 62), num_threads: 8, chunk_size: 3, schedule: Schedule.Dynamic, action: (i, j, k) =>
+            {
+                DotMP.Atomic.Inc(ref iters_hit_3[i, j, k]);
+            });
+
+            for (int i = 0; i < 128; i++)
+                for (int j = 0; j < 128; j++)
+                    for (int k = 0; k < 64; k++)
+                        if (i >= 35 && i < 64 && j >= 16 && j < 100 && k >= 10 && k < 62)
+                            iters_hit_3[i, j, k].Should().Be(1);
+                        else
+                            iters_hit_3[i, j, k].Should().Be(0);
+
+            iters_hit_3 = null;*/
+        }
+
         private void random_init<T>(T[] arr)
         {
             Random r = new Random();

From 141aafdc2920a69d97e6f29e9dc59f7262b2ee15 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:41:09 -0600
Subject: [PATCH 50/61] tidy up calls

---
 DotMP-Tests/GPUTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs
index f2b7d9b8..0ff2e67c 100644
--- a/DotMP-Tests/GPUTests.cs
+++ b/DotMP-Tests/GPUTests.cs
@@ -82,7 +82,7 @@ public void Collapse_works()
 
             using (var buf = new Buffer<int>(iters_hit, DotMP.GPU.Buffer.Behavior.ToFrom))
             {
-                DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, action: (i, j, iters_hit) =>
+                DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, (i, j, iters_hit) =>
                 {
                     iters_hit[i, j]++;
                 });

From b5ca7313fd33b38fe452e6c6ea8742fa006f0425 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:41:38 -0600
Subject: [PATCH 51/61] implement caching of indices

---
 DotMP/GPU/AcceleratorHandler.tt | 183 +++++++++++++++++++++++++++-----
 1 file changed, 155 insertions(+), 28 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index 0fbe6ee5..e3582530 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -30,7 +30,7 @@ namespace DotMP.GPU
     /// <summary>
     /// The handler class managing GPU acceleration.
     /// </summary>
-    internal class AcceleratorHandler
+    internal sealed class AcceleratorHandler
     {
         /// <summary>
         /// Determines if a GPU context has been initialized yet.
@@ -51,7 +51,15 @@ namespace DotMP.GPU
         /// <summary>
         /// Kernel cache.
         /// </summary>
-        private static Dictionary<string, dynamic> kernels = new Dictionary<string, dynamic>();
+        private static Dictionary<string, Delegate> kernels = new Dictionary<string, Delegate>();
+
+        private static Dictionary<string, ValueTuple<int, int, Buffer<int>>> indices1d = new Dictionary<string, ValueTuple<int, int, Buffer<int>>>();
+
+        private static Dictionary<string, ValueTuple<int, int, int, int, Buffer<int>, Buffer<int>>> indices2d =
+            new Dictionary<string, ValueTuple<int, int, int, int, Buffer<int>, Buffer<int>>>();
+
+        private static Dictionary<string, ValueTuple<ValueTuple<int, int>, ValueTuple<int, int>, ValueTuple<int, int>, Buffer<int>, Buffer<int>, Buffer<int>>> indices3d =
+            new Dictionary<string, ValueTuple<ValueTuple<int, int>, ValueTuple<int, int>, ValueTuple<int, int>, Buffer<int>, Buffer<int>, Buffer<int>>>();
 
         /// <summary>
         /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data.
@@ -98,11 +106,11 @@ namespace DotMP.GPU
         /// <param name="action">The action provided on the CPU.</param>
         /// <param name="src">The calling location.</param>
         /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, IndexI,
+        private Action<KernelConfig, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > GetKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
-        >(Action<IndexI,
+        >(Action<Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
         > action, string src)
 <# for (int i = 0; i < c; i++) { #>
@@ -112,7 +120,7 @@ namespace DotMP.GPU
             if (!kernels.ContainsKey(src))
                 kernels.Add(src, accelerator.LoadStreamKernel(action));
 
-            return (Action<KernelConfig, IndexI,
+            return (Action<KernelConfig, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
             >) kernels[src];
         }
@@ -125,11 +133,11 @@ namespace DotMP.GPU
         /// <param name="action">The action provided on the CPU.</param>
         /// <param name="src">The calling location.</param>
         /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, IndexI, IndexJ,
+        private Action<KernelConfig, Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > GetKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
-        >(Action<IndexI, IndexJ,
+        >(Action<Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
         > action, string src)
 <# for (int i = 0; i < c; i++) { #>
@@ -139,7 +147,7 @@ namespace DotMP.GPU
             if (!kernels.ContainsKey(src))
                 kernels.Add(src, accelerator.LoadStreamKernel(action));
 
-            return (Action<KernelConfig, IndexI, IndexJ,
+            return (Action<KernelConfig, Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
             >) kernels[src];
         }
@@ -152,11 +160,11 @@ namespace DotMP.GPU
         /// <param name="action">The action provided on the CPU.</param>
         /// <param name="src">The calling location.</param>
         /// <returns>The GPU kernel.</returns>
-        private Action<KernelConfig, IndexI, IndexJ, IndexK,
+        private Action<KernelConfig, Index, Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > GetKernel<
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
-        >(Action<IndexI, IndexJ, IndexK,
+        >(Action<Index, Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #>
         > action, string src)
 <# for (int i = 0; i < c; i++) { #>
@@ -166,12 +174,129 @@ namespace DotMP.GPU
             if (!kernels.ContainsKey(src))
                 kernels.Add(src, accelerator.LoadStreamKernel(action));
 
-            return (Action<KernelConfig, IndexI, IndexJ, IndexK,
+            return (Action<KernelConfig, Index, Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> 
             >) kernels[src];
         }
 <# } #>
 
+        /// <summary>
+        /// Returns the cached index buffer for a 1D for loop, rebuilding it if the range at this call site changed.
+        /// </summary>
+        /// <param name="range">The range of the for loop (start inclusive, end exclusive).</param>
+        /// <param name="src">The calling location in the source code, used as the cache key.</param>
+        /// <returns>An Index wrapping the buffer of precomputed loop indices.</returns>
+        internal Index Get1DIdx((int, int) range, string src)
+        {
+            if (indices1d.TryGetValue(src, out var cached))
+            {
+                if (cached.Item1 == range.Item1 && cached.Item2 == range.Item2)
+                    return new Index(cached.Item3);
+
+                // Same call site, different range: the cached buffer is stale, so release it.
+                cached.Item3.Dispose();
+            }
+
+            // Materialize range.Item1, range.Item1 + 1, ..., range.Item2 - 1 on the CPU.
+            int[] indices = new int[range.Item2 - range.Item1];
+            for (int i = 0; i < indices.Length; i++)
+                indices[i] = i + range.Item1;
+
+            var buf = new Buffer<int>(indices, Buffer.Behavior.To);
+            indices1d[src] = (range.Item1, range.Item2, buf);
+            return new Index(buf);
+        }
+
+        /// <summary>
+        /// Returns the cached index buffers for a collapsed 2D for loop, rebuilding them if either range changed.
+        /// </summary>
+        /// <param name="range1">The range of the outer for loop.</param>
+        /// <param name="range2">The range of the inner for loop.</param>
+        /// <param name="src">The calling location in the source code, used as the cache key.</param>
+        /// <returns>The outer and inner indices; the inner index varies fastest.</returns>
+        internal ValueTuple<Index, Index> Get2DIdx((int, int) range1, (int, int) range2, string src)
+        {
+            if (indices2d.TryGetValue(src, out var cached))
+            {
+                if (cached.Item1 == range1.Item1 && cached.Item2 == range1.Item2 &&
+                    cached.Item3 == range2.Item1 && cached.Item4 == range2.Item2)
+                    return (new Index(cached.Item5), new Index(cached.Item6));
+                // Same call site, different ranges: the cached buffers are stale, so release them.
+                cached.Item5.Dispose();
+                cached.Item6.Dispose();
+            }
+
+            int len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1);
+            int[] indi = new int[len];
+            int[] indj = new int[len];
+            int ci = range1.Item1, cj = range2.Item1;
+            for (int i = 0; i < len; i++)
+            {
+                indi[i] = ci;
+                indj[i] = cj;
+                if (++cj == range2.Item2)
+                {
+                    cj = range2.Item1;
+                    ++ci;
+                }
+            }
+
+            var b1 = new Buffer<int>(indi, Buffer.Behavior.To);
+            var b2 = new Buffer<int>(indj, Buffer.Behavior.To);
+            indices2d[src] = (range1.Item1, range1.Item2, range2.Item1, range2.Item2, b1, b2);
+            return (new Index(b1), new Index(b2));
+        }
+
+        /// <summary>
+        /// Returns the cached index buffers for a collapsed 3D for loop, rebuilding them if any range changed.
+        /// </summary>
+        /// <param name="range1">The range of the outermost for loop.</param>
+        /// <param name="range2">The range of the middle for loop.</param>
+        /// <param name="range3">The range of the innermost for loop.</param>
+        /// <param name="src">The calling location in the source code, used as the cache key.</param>
+        /// <returns>The three loop indices; the innermost index varies fastest.</returns>
+        internal ValueTuple<Index, Index, Index> Get3DIdx((int, int) range1, (int, int) range2, (int, int) range3, string src)
+        {
+            if (indices3d.TryGetValue(src, out var cached))
+            {
+                if (cached.Item1.Item1 == range1.Item1 && cached.Item1.Item2 == range1.Item2 &&
+                    cached.Item2.Item1 == range2.Item1 && cached.Item2.Item2 == range2.Item2 &&
+                    cached.Item3.Item1 == range3.Item1 && cached.Item3.Item2 == range3.Item2)
+                    return (new Index(cached.Item4), new Index(cached.Item5), new Index(cached.Item6));
+                // Same call site, different ranges: the cached buffers are stale, so release them.
+                cached.Item4.Dispose();
+                cached.Item5.Dispose();
+                cached.Item6.Dispose();
+            }
+
+            int len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1);
+            int[] indi = new int[len];
+            int[] indj = new int[len];
+            int[] indk = new int[len];
+            int ci = range1.Item1, cj = range2.Item1, ck = range3.Item1;
+            for (int i = 0; i < len; i++)
+            {
+                indi[i] = ci;
+                indj[i] = cj;
+                indk[i] = ck;
+                if (++ck == range3.Item2)
+                {
+                    ck = range3.Item1;
+                    if (++cj == range2.Item2)
+                    {
+                        cj = range2.Item1;
+                        ++ci;
+                    }
+                }
+            }
+            var b1 = new Buffer<int>(indi, Buffer.Behavior.To);
+            var b2 = new Buffer<int>(indj, Buffer.Behavior.To);
+            var b3 = new Buffer<int>(indk, Buffer.Behavior.To);
+            indices3d[src] = ((range1.Item1, range1.Item2), (range2.Item1, range2.Item2), (range3.Item1, range3.Item2), b1, b2, b3);
+            return (new Index(b1), new Index(b2), new Index(b3));
+        }
+
+
 <# for (int c = 1; c <= max; c++) { #>
         /// <summary>
         /// Dispatches a linear kernel with the given number of parameters.
@@ -186,15 +311,15 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >((int, int) range1,
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<IndexI,
+        Action<Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, string src)
 <# for (int i = 0; i < c; i++) { #>
             where <#= letters[i] #> : unmanaged
 <# } #>
         {
+            var idx = Get1DIdx(range1, src);
             var len = range1.Item2 - range1.Item1;
-            var idx = new IndexI(range1);
 
             var kernel = GetKernel(action, src);
 
@@ -208,7 +333,8 @@ namespace DotMP.GPU
 
             if (not_done > 0)
             {
-                idx = new IndexI(range1, len - (not_done));
+                int offset = len - not_done;
+                idx.AddOffset(offset);
 
                 kernel((1, not_done), idx
 <# for (int i = 0; i < c; i++) { #>
@@ -236,7 +362,7 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >((int, int) range1, (int, int) range2,
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<IndexI, IndexJ,
+        Action<Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, string src)
 <# for (int i = 0; i < c; i++) { #>
@@ -244,14 +370,17 @@ namespace DotMP.GPU
 <# } #>
         {
             var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1);
-            var i = new IndexI(range1, range2);
-            var j = new IndexJ(range1, range2);
+            (var i, var j) = Get2DIdx(range1, range2, src);
 
             var kernel = GetKernel(action, src);
 
+<# for (int i = 0; i < c; i++) { #>
+            var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>);
+<# } #>
+
             kernel((len / block_size, block_size), i, j
 <# for (int i = 0; i < c; i++) { #>
-            , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+            , gpu<#= i + 1 #>
 <# } #>
             );
 
@@ -260,12 +389,12 @@ namespace DotMP.GPU
             if (not_done > 0)
             {
                 int offset = len - not_done;
-                i = new IndexI(range1, range2, offset);
-                j = new IndexJ(range1, range2, offset);
+                i.AddOffset(offset);
+                j.AddOffset(offset);
 
                 kernel((1, not_done), i, j
 <# for (int i = 0; i < c; i++) { #>
-                , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+                , gpu<#= i + 1 #>
 <# } #>
                 );
             }
@@ -290,7 +419,7 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >((int, int) range1, (int, int) range2, (int, int) range3,
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<IndexI, IndexJ, IndexK,
+        Action<Index, Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, string src)
 <# for (int i = 0; i < c; i++) { #>
@@ -298,9 +427,7 @@ namespace DotMP.GPU
 <# } #>
         {
             var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1);
-            var i = new IndexI(range1, range2, range3);
-            var j = new IndexJ(range1, range2, range3);
-            var k = new IndexK(range1, range2, range3);
+            (var i, var j, var k) = Get3DIdx(range1, range2, range3, src);
 
             var kernel = GetKernel(action, src);
 
@@ -315,9 +442,9 @@ namespace DotMP.GPU
             if (not_done > 0)
             {
                 int offset = len - not_done;
-                i = new IndexI(range1, range2, range3, offset);
-                j = new IndexJ(range1, range2, range3, offset);
-                k = new IndexK(range1, range2, range3, offset);
+                i.AddOffset(offset);
+                j.AddOffset(offset);
+                k.AddOffset(offset);
 
                 kernel((1, not_done), i, j, k
 <# for (int i = 0; i < c; i++) { #>

From 2275b97f0fb143329a43238144bd987a1d914790 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:42:08 -0600
Subject: [PATCH 52/61] add support for 3D buffers

---
 DotMP/GPU/Buffer.cs | 53 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs
index 87b756e3..df45c507 100644
--- a/DotMP/GPU/Buffer.cs
+++ b/DotMP/GPU/Buffer.cs
@@ -50,7 +50,7 @@ public enum Behavior
     /// <summary>
     /// Buffer to manage GPU memory. Should only be created on the CPU.
     /// </summary>
-    public class Buffer<T> : IDisposable
+    public sealed class Buffer<T> : IDisposable
         where T : unmanaged
     {
         /// <summary>
@@ -63,6 +63,11 @@ public class Buffer<T> : IDisposable
         /// </summary>
         private MemoryBuffer2D<T, Stride2D.DenseY> buf2d;
 
+        /// <summary>
+        /// The ILGPU buffer for 3D arrays.
+        /// </summary>
+        private MemoryBuffer3D<T, Stride3D.DenseZY> buf3d;
+
         /// <summary>
         /// Behavior of the data, as specified by Behavior.
         /// </summary>
@@ -78,6 +83,11 @@ public class Buffer<T> : IDisposable
         /// </summary>
         private T[,] data2d;
 
+        /// <summary>
+        /// The CPU 3D array, so that we can copy the data back.
+        /// </summary>
+        private T[,,] data3d;
+
         /// <summary>
         /// Handler int for the number of dimensions in the array.
         /// </summary>
@@ -156,6 +166,33 @@ public Buffer(T[,] data, Buffer.Behavior behavior)
             Dimensions = 2;
         }
 
+        /// <summary>
+        /// Creates a buffer backed by a 3D array, allocating matching GPU memory according to the requested behavior.
+        /// </summary>
+        /// <param name="data">The CPU array to mirror on the GPU.</param>
+        /// <param name="behavior">How the data is transferred, see Behavior.</param>
+        public Buffer(T[,,] data, Buffer.Behavior behavior)
+        {
+            // Constructing a handler initializes the shared accelerator state on first use.
+            new AcceleratorHandler();
+
+            this.behavior = behavior;
+            this.data3d = data;
+
+            if (behavior == Buffer.Behavior.To || behavior == Buffer.Behavior.ToFrom)
+            {
+                // The host contents are needed on the device: allocate from the array itself.
+                buf3d = AcceleratorHandler.accelerator.Allocate3DDenseZY(data);
+            }
+            else if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.NoCopy)
+            {
+                // Nothing to upload: allocate device memory with the same extents only.
+                buf3d = AcceleratorHandler.accelerator.Allocate3DDenseZY<T>((data.GetLength(0), data.GetLength(1), data.GetLength(2)));
+            }
+
+            Dimensions = 3;
+        }
+
         /// <summary>
         /// Dispose of the buffer, freeing GPU memory and copying any relevant data back to the CPU.
         /// </summary>
@@ -179,6 +216,15 @@ public void Dispose()
 
                 buf2d.Dispose();
             }
+            else if (Dimensions == 3)
+            {
+                if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom)
+                {
+                    System.Buffer.BlockCopy(buf3d.GetAsArray3D(), 0, data3d, 0, Unsafe.SizeOf<T>() * data3d.Length);
+                }
+
+                buf3d.Dispose();
+            }
         }
 
         /// <summary>
@@ -190,5 +236,10 @@ public void Dispose()
         /// Get the view of the memory for the GPU.
         /// </summary>
         internal ArrayView2D<T, Stride2D.DenseY> View2D { get => buf2d.View; }
+
+        /// <summary>
+        /// Get the 3D view of the memory for the GPU (the view of buf3d).
+        /// </summary>
+        internal ArrayView3D<T, Stride3D.DenseZY> View3D { get => buf3d.View; }
     }
 }
\ No newline at end of file

From 81c39ba413456cf9a922c490b68be20c1617dd3d Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:42:36 -0600
Subject: [PATCH 53/61] migrate to new index technique

---
 DotMP/GPU/Gpu.tt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt
index 060c3d8a..6cf2d841 100644
--- a/DotMP/GPU/Gpu.tt
+++ b/DotMP/GPU/Gpu.tt
@@ -59,7 +59,7 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >(int start, int end, 
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<IndexI, 
+        Action<Index, 
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
 <# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #>
@@ -89,7 +89,7 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >((int, int) range1, (int, int) range2, 
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<IndexI, IndexJ,
+        Action<Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
 <# for (int i = 0; i < c; i++) { #>
@@ -122,7 +122,7 @@ namespace DotMP.GPU
 <# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #>
         >((int, int) range1, (int, int) range2, (int, int) range3,
 <# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #>
-        Action<IndexI, IndexJ, IndexK,
+        Action<Index, Index, Index,
 <# for (int i = 0; i < c; i++) { #> GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #>
         > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0)
 <# for (int i = 0; i < c; i++) { #>
@@ -137,4 +137,4 @@ namespace DotMP.GPU
         }
 <# } #>
     }
-}
\ No newline at end of file
+}

From af259ec49d52f4936c6da3eb6f68cae222aa48d8 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:43:06 -0600
Subject: [PATCH 54/61] WIP

---
 DotMP/GPU/GpuArray.cs | 91 +++++++++++++++++++++++++++----------------
 1 file changed, 58 insertions(+), 33 deletions(-)

diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index a7d7d705..b4cbe394 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -15,6 +15,7 @@
 */
 
 using ILGPU;
+using ILGPU.IR.Values;
 using ILGPU.Runtime;
 using System;
 using System.Diagnostics.CodeAnalysis;
@@ -30,15 +31,20 @@ public struct GPUArray<T>
         where T : unmanaged
     {
         /// <summary>
-        /// The ILGPU buffer for 1D arrays.
+        /// The ILGPU view for 1D arrays.
         /// </summary>
         private ArrayView1D<T, Stride1D.Dense> view1d;
 
         /// <summary>
-        /// The ILGPU buffer for 2D arrays.
+        /// The ILGPU view for 2D arrays.
         /// </summary>
         private ArrayView2D<T, Stride2D.DenseY> view2d;
 
+        /// <summary>
+        /// The ILGPU view for 3D arrays.
+        /// </summary>
+        private ArrayView3D<T, Stride3D.DenseZY> view3d;
+
         /// <summary>
         /// Number of dimensions.
         /// </summary>
@@ -47,30 +53,37 @@ public struct GPUArray<T>
         /// <summary>
         /// Constructor.
         /// </summary>
-        /// <param name="arrayView">The ArrayView to wrap.</param>
-        public GPUArray(Buffer<T> arrayView)
+        /// <param name="buf">The Buffer to create an array from.</param>
+        internal GPUArray(Buffer<T> buf)
         {
-            if (arrayView.Dimensions == 1)
-            {
-                view1d = arrayView.View1D;
-                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
-                view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
-            }
-            else if (arrayView.Dimensions == 2)
-            {
-                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
-                view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
-                view2d = arrayView.View2D;
-            }
-            else
+            switch (buf.Dimensions)
             {
-                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
-                view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
-                // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
-                view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
+                /*case 1:
+                    view1d = buf.View1D;
+                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                    view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
+                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                    view3d = new Buffer<T>(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D;
+                    break;*/
+		default:
+                case 2:
+                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                    //view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
+                    view2d = buf.View2D;
+                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                    //view3d = new Buffer<T>(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D;
+                    break;
+                /*case 3:
+                default:
+                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                    view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
+                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                    view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
+                    view3d = buf.View3D;
+                    break;*/
             }
 
-            dims = arrayView.Dimensions;
+            dims = buf.Dimensions;
         }
 
         /// <summary>
@@ -78,11 +91,10 @@ public GPUArray(Buffer<T> arrayView)
         /// </summary>
         /// <param name="idx">The ID to index into.</param>
         /// <returns>The data at that ID.</returns>
-        public T this[int idx]
-        {
-            get => view1d[idx];
-            set => view1d[idx] = value;
-        }
+        //public ref T this[int idx]
+        //{
+        //    get => ref view1d[idx];
+        //}
 
         /// <summary>
         /// Overload for [,] operator.
@@ -90,12 +102,23 @@ public T this[int idx]
         /// <param name="i">The first ID to index into.</param>
         /// <param name="j">The second ID to index into.</param>
         /// <returns>The data at that ID.</returns>
-        public T this[int i, int j]
+        public ref T this[int i, int j]
         {
-            get => view2d[i, j];
-            set => view2d[i, j] = value;
+            get => ref view2d[i, j];
         }
 
+        /// <summary>
+        /// Overload for [,,] operator.
+        /// </summary>
+        /// <param name="i">The first ID to index into.</param>
+        /// <param name="j">The second ID to index into.</param>
+        /// <param name="k">The third ID to index into.</param>
+        /// <returns>The data at that ID.</returns>
+        //public ref T this[int i, int j, int k]
+        //{
+        //    get => ref view3d[i, j, k];
+        //}
+
         /// <summary>
         /// Gets the length of the array.
         /// </summary>
@@ -105,13 +128,15 @@ public int Length
             {
                 switch (dims)
                 {
-                    case 1:
+                    //case 1:
                     default:
-                        return view1d.IntLength;
+                    //    return view1d.IntLength;
                     case 2:
                         return view2d.IntLength;
+                    //case 3:
+                    //    return view3d.IntLength;
                 }
             }
         }
     }
-}
\ No newline at end of file
+}

From 7e4070aaf1a48c06df632b121bef7ce35c572ffb Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:43:27 -0600
Subject: [PATCH 55/61] new index technique via index caching

---
 DotMP/GPU/Index.cs | 329 +++------------------------------------------
 1 file changed, 17 insertions(+), 312 deletions(-)

diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index f80cf57d..e6da53d3 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -15,8 +15,11 @@
 */
 
 using ILGPU;
+using ILGPU.Runtime;
 using System.Diagnostics.CodeAnalysis;
+using System.Linq;
 using System.Runtime.CompilerServices;
+using System.Xml;
 
 namespace DotMP.GPU
 {
@@ -24,329 +27,31 @@ namespace DotMP.GPU
     /// Represents an index passed as the first index argument.
     /// </summary>
     [ExcludeFromCodeCoverage]
-    public struct IndexI
+    public struct Index
     {
-        /// <summary>
-        /// The start of the first for loop, for index calculations.
-        /// </summary>
-        private int start1;
-
-        /// <summary>
-        /// The start of the second for loop, for index calculations.
-        /// </summary>
-        private int start2;
-
-        /// <summary>
-        /// The start of the third for loop, for index calculations.
-        /// </summary>
-        private int start3;
-
-        /// <summary>
-        /// The index to return.
-        /// </summary>
-        private int idx_prv;
-
-        /// <summary>
-        /// The difference between the second set of ranges.
-        /// </summary>
-        private int diff2;
-
-        /// <summary>
-        /// The difference between the third set of ranges.
-        /// </summary>
-        private int diff3;
-
-        /// <summary>
-        /// The offset, in case of a followup kernel.
-        /// </summary>
-        private int offset;
-
-        /// <summary>
-        /// The number of dimensions.
-        /// </summary>
-        private int dims;
-
-        /// <summary>
-        /// Constructor.
-        /// </summary>
-        /// <param name="range">The range of the for loop.</param>
-        /// <param name="offset">The offset for followup kernels.</param>
-        internal IndexI((int, int) range, int offset = 0)
-        {
-            this.offset = offset;
-
-            start1 = range.Item1;
-            start2 = -1;
-            start3 = -1;
-            idx_prv = -1;
-            diff2 = -1;
-            diff3 = -1;
-            dims = 1;
-        }
-
-        /// <summary>
-        /// Constructor.
-        /// </summary>
-        /// <param name="range1">The outer range of the for loop.</param>
-        /// <param name="range2">The inner range of the for loop.</param>
-        /// <param name="offset">The offset for followup kernels.</param>
-        internal IndexI((int, int) range1, (int, int) range2, int offset = 0)
-        {
-            this.offset = offset;
-
-            start1 = range1.Item1;
-            start2 = range2.Item1;
-            start3 = -1;
-            idx_prv = -1;
-            diff2 = range2.Item2 - range2.Item1;
-            diff3 = -1;
-            dims = 2;
-        }
-
-        /// <summary>
-        /// Constructor.
-        /// </summary>
-        /// <param name="range1">The outer range of the for loop.</param>
-        /// <param name="range2">The middle range of the for loop.</param>
-        /// <param name="range3">The inner range of the for loop.</param>
-        /// <param name="offset">The offset for followup kernels.</param>
-        internal IndexI((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0)
-        {
-            this.offset = offset;
-
-            start1 = range1.Item1;
-            start2 = range2.Item1;
-            start3 = range3.Item1;
-            idx_prv = -1;
-            diff2 = range2.Item2 - range2.Item1;
-            diff3 = range3.Item2 - range3.Item1;
-            dims = 3;
-        }
-
-        /// <summary>
-        /// Casts an index to an int.
-        /// </summary>
-        /// <param name="h">The Index struct to cast.</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static implicit operator int(IndexI h)
-        {
-            switch (h.dims)
-            {
-                default:
-                case 1:
-                    if (h.idx_prv == -1)
-                        h.idx_prv = Grid.GlobalLinearIndex + h.start1 + h.offset;
-
-                    return h.idx_prv;
-
-                case 2:
-                    if (h.idx_prv == -1)
-                    {
-                        int idxoffset = Grid.GlobalLinearIndex + h.offset;
-                        h.idx_prv = IntrinsicMath.DivRoundDown(idxoffset, h.diff2) + h.start1;
-                    }
-
-                    return h.idx_prv;
-
-                case 3:
-                    if (h.idx_prv == -1)
-                    {
-
-                    }
-
-                    return h.idx_prv;
-            }
-        }
-    }
-
-    /// <summary>
-    /// Represents an index passed as the second index argument.
-    /// </summary>
-    [ExcludeFromCodeCoverage]
-    public struct IndexJ
-    {
-        /// <summary>
-        /// The start of the first for loop, for index calculations.
-        /// </summary>
-        private int start1;
-
-        /// <summary>
-        /// The start of the second for loop, for index calculations.
-        /// </summary>
-        private int start2;
-
-        /// <summary>
-        /// The start of the third for loop, for index calculations.
-        /// </summary>
-        private int start3;
-
-        /// <summary>
-        /// The index to return.
-        /// </summary>
-        private int idx_prv;
-
-        /// <summary>
-        /// The difference between the second set of ranges.
-        /// </summary>
-        private int diff2;
-
-        /// <summary>
-        /// The difference between the third set of ranges.
-        /// </summary>
-        private int diff3;
-
-        /// <summary>
-        /// The offset, in case of a followup kernel.
-        /// </summary>
+        private ArrayView1D<int, Stride1D.Dense> lookup;
         private int offset;
+        private int idx;
 
-        /// <summary>
-        /// The number of dimensions.
-        /// </summary>
-        private int dims;
-
-        /// <summary>
-        /// Constructor.
-        /// </summary>
-        /// <param name="range1">The outer range of the for loop.</param>
-        /// <param name="range2">The inner range of the for loop.</param>
-        /// <param name="offset">The offset for followup kernels.</param>
-        internal IndexJ((int, int) range1, (int, int) range2, int offset = 0)
+        internal Index(Buffer<int> buf)
         {
-            this.offset = offset;
-
-            start1 = range1.Item1;
-            start2 = range2.Item1;
-            start3 = -1;
-            idx_prv = -1;
-            diff2 = range2.Item2 - range2.Item1;
-            diff3 = -1;
-            dims = 2;
+            this.lookup = buf.View1D;
+            offset = 0;
+            idx = -1;
         }
 
-        /// <summary>
-        /// Constructor.
-        /// </summary>
-        /// <param name="range1">The outer range of the for loop.</param>
-        /// <param name="range2">The middle range of the for loop.</param>
-        /// <param name="range3">The inner range of the for loop.</param>
-        /// <param name="offset">The offset for followup kernels.</param>
-        internal IndexJ((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0)
+        internal void AddOffset(int offset)
         {
             this.offset = offset;
-
-            start1 = range1.Item1;
-            start2 = range2.Item1;
-            start3 = range3.Item1;
-            idx_prv = -1;
-            diff2 = range2.Item2 - range2.Item1;
-            diff3 = range3.Item2 - range3.Item1;
-            dims = 3;
         }
 
-        /// <summary>
-        /// Casts an index to an int.
-        /// </summary>
-        /// <param name="h">The Index struct to cast.</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static implicit operator int(IndexJ h)
+	[MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static implicit operator int(Index i)
         {
-            switch (h.dims)
-            {
-                default:
-                case 2:
-                    if (h.idx_prv == -1)
-                    {
-                        int idxoffset = Grid.GlobalLinearIndex + h.offset;
-                        h.idx_prv = (idxoffset % h.diff2) + h.start2;
-                    }
-
-                    return h.idx_prv;
-
-                case 3:
-                    if (h.idx_prv == -1)
-                    {
-
-                    }
-
-                    return h.idx_prv;
-            }
-        }
-    }
-
-    /// <summary>
-    /// Represents an index passed as the third index argument.
-    /// </summary>
-    [ExcludeFromCodeCoverage]
-    public struct IndexK
-    {
-        /// <summary>
-        /// The start of the first for loop, for index calculations.
-        /// </summary>
-        private int start1;
-
-        /// <summary>
-        /// The start of the second for loop, for index calculations.
-        /// </summary>
-        private int start2;
-
-        /// <summary>
-        /// The start of the third for loop, for index calculations.
-        /// </summary>
-        private int start3;
-
-        /// <summary>
-        /// The index to return.
-        /// </summary>
-        private int idx_prv;
-
-        /// <summary>
-        /// The difference between the second set of ranges.
-        /// </summary>
-        private int diff2;
-
-        /// <summary>
-        /// The difference between the third set of ranges.
-        /// </summary>
-        private int diff3;
-
-        /// <summary>
-        /// The offset, in case of a followup kernel.
-        /// </summary>
-        private int offset;
-
-        /// <summary>
-        /// Constructor.
-        /// </summary>
-        /// <param name="range1">The outer range of the for loop.</param>
-        /// <param name="range2">The middle range of the for loop.</param>
-        /// <param name="range3">The inner range of the for loop.</param>
-        /// <param name="offset">The offset for followup kernels.</param>
-        internal IndexK((int, int) range1, (int, int) range2, (int, int) range3, int offset = 0)
-        {
-            this.offset = offset;
-
-            start1 = range1.Item1;
-            start2 = range2.Item1;
-            start3 = range3.Item1;
-            idx_prv = -1;
-            diff2 = range2.Item2 - range2.Item1;
-            diff3 = range3.Item2 - range3.Item1;
-        }
-
-        /// <summary>
-        /// Casts an index to an int.
-        /// </summary>
-        /// <param name="h">The Index struct to cast.</param>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static implicit operator int(IndexK h)
-        {
-            if (h.idx_prv == -1)
-            {
-
-            }
+            if (i.idx == -1)
+                i.idx = i.lookup[Grid.GlobalLinearIndex + i.offset];
 
-            return h.idx_prv;
+            return i.idx;
         }
     }
-}
\ No newline at end of file
+}

From a6d0072c594bf84823856ec2f880119d1ec9946b Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:43:48 -0600
Subject: [PATCH 56/61] update benchmarks

---
 benchmarks/GPUHeatTransfer/Program.cs         |  4 +-
 benchmarks/GPUOverhead/Program.cs             |  6 +-
 benchmarks/ILGPUOverhead/ILGPUOverhead.csproj | 18 ++++++
 benchmarks/ILGPUOverhead/Program.cs           | 63 +++++++++++++++++++
 4 files changed, 86 insertions(+), 5 deletions(-)
 create mode 100644 benchmarks/ILGPUOverhead/ILGPUOverhead.csproj
 create mode 100644 benchmarks/ILGPUOverhead/Program.cs

diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs
index 0d332da2..75d0747f 100644
--- a/benchmarks/GPUHeatTransfer/Program.cs
+++ b/benchmarks/GPUHeatTransfer/Program.cs
@@ -37,7 +37,7 @@ public class HeatTransfer
     public enum ParType { DMPFor, DMPGPU }
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    [Params(500)]
+    [Params(768)]
     public int dim;
 
     // test with 10 steps and 100 steps
@@ -159,7 +159,7 @@ public class HeatTransferVerify
     public enum ParType { DMPFor, DMPGPU }
 
     // test dims of 100x100, 1000x1000, and 5000x5000
-    public int dim = 500;
+    public int dim = 1000;
 
     // test with 10 steps and 100 steps
     public int steps = 100;
diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs
index b9ff18a5..579d3868 100644
--- a/benchmarks/GPUOverhead/Program.cs
+++ b/benchmarks/GPUOverhead/Program.cs
@@ -27,20 +27,20 @@
 [EventPipeProfiler(EventPipeProfile.CpuSampling)]
 public class Overhead
 {
-    DotMP.GPU.Buffer<byte> buf;
+    DotMP.GPU.Buffer<int> buf;
 
     // run the setup
     [GlobalSetup]
     public void Setup()
     {
-	buf = new DotMP.GPU.Buffer<byte>(new byte[1], DotMP.GPU.Buffer.Behavior.NoCopy);
+	buf = new DotMP.GPU.Buffer<int>(new int[1, 1], DotMP.GPU.Buffer.Behavior.NoCopy);
     }
 
     //run the simulation
     [Benchmark]
     public void TestOverhead()
     {
-	DotMP.GPU.Parallel.ParallelFor(0, 1, buf, (i, buf) => { });
+	DotMP.GPU.Parallel.ParallelForCollapse((0, 500), (0, 500), buf, (i, j, buf) => { });
     }
 }
 
diff --git a/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj
new file mode 100644
index 00000000..9cf0a6f0
--- /dev/null
+++ b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <PackageReference Include="BenchmarkDotNet" Version="0.13.10" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\DotMP\DotMP.csproj" />
+  </ItemGroup>
+
+</Project>
diff --git a/benchmarks/ILGPUOverhead/Program.cs b/benchmarks/ILGPUOverhead/Program.cs
new file mode 100644
index 00000000..862ae1af
--- /dev/null
+++ b/benchmarks/ILGPUOverhead/Program.cs
@@ -0,0 +1,63 @@
+﻿/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Diagnosers;
+using System;
+using ILGPU;
+using ILGPU.Runtime;
+
+/* jscpd:ignore-start */
+
+[SimpleJob(RuntimeMoniker.Net60)]
+[ThreadingDiagnoser]
+[HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
+public class Overhead
+{
+    Action<KernelConfig, ArrayView1D<int, Stride1D.Dense>> kernel;
+    ArrayView1D<int, Stride1D.Dense> data;
+
+    // run the setup
+    [GlobalSetup]
+    public void Setup()
+    {
+	var context = Context.CreateDefault();
+	var accelerator = context.Devices[1].CreateAccelerator(context);
+	kernel = accelerator.LoadStreamKernel<ArrayView1D<int, Stride1D.Dense>>(arr => { });
+	data = accelerator.Allocate1D<int>(1); 
+    }
+
+    // run the benchmark: launch the empty kernel once
+    [Benchmark]
+    public void TestOverhead()
+    {
+	kernel((1, 256), data);
+    }
+}
+
+/* jscpd:ignore-end */
+
+// driver
+public class Program
+{
+    public static void Main(string[] args)
+    {
+        BenchmarkRunner.Run<Overhead>();
+    }
+}

From ca0b54a25c46afa73a4d597f612cabc6892b371b Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Mon, 13 Nov 2023 12:44:48 -0600
Subject: [PATCH 57/61] run dotnet format

---
 DotMP/GPU/GpuArray.cs | 22 +++++++++++-----------
 DotMP/GPU/Index.cs    |  2 +-
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index b4cbe394..11efff12 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -65,7 +65,7 @@ internal GPUArray(Buffer<T> buf)
                     // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
                     view3d = new Buffer<T>(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D;
                     break;*/
-		default:
+                default:
                 case 2:
                     // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
                     //view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
@@ -73,14 +73,14 @@ internal GPUArray(Buffer<T> buf)
                     // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
                     //view3d = new Buffer<T>(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D;
                     break;
-                /*case 3:
-                default:
-                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
-                    view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
-                    // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
-                    view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
-                    view3d = buf.View3D;
-                    break;*/
+                    /*case 3:
+                    default:
+                        // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                        view1d = new Buffer<T>(new T[1], Buffer.Behavior.NoCopy).View1D;
+                        // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices.
+                        view2d = new Buffer<T>(new T[1, 1], Buffer.Behavior.NoCopy).View2D;
+                        view3d = buf.View3D;
+                        break;*/
             }
 
             dims = buf.Dimensions;
@@ -133,8 +133,8 @@ public int Length
                     //    return view1d.IntLength;
                     case 2:
                         return view2d.IntLength;
-                    //case 3:
-                    //    return view3d.IntLength;
+                        //case 3:
+                        //    return view3d.IntLength;
                 }
             }
         }
diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index e6da53d3..159fee65 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -45,7 +45,7 @@ internal void AddOffset(int offset)
             this.offset = offset;
         }
 
-	[MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static implicit operator int(Index i)
         {
             if (i.idx == -1)

From 1c72a2fe4c11b4f5c5616a56cb5ff9c38b973700 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Tue, 14 Nov 2023 17:49:18 -0600
Subject: [PATCH 58/61] add support for .NET 8

---
 DotMP/DotMP.csproj | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DotMP/DotMP.csproj b/DotMP/DotMP.csproj
index 00a47b68..24113bd7 100644
--- a/DotMP/DotMP.csproj
+++ b/DotMP/DotMP.csproj
@@ -1,7 +1,7 @@
 <Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-    <TargetFrameworks>net6.0;net7.0</TargetFrameworks>
+    <TargetFrameworks>net6.0;net7.0;net8.0</TargetFrameworks>
     <RootNamespace>DotMP</RootNamespace>
     <PackageId>DotMP</PackageId>
     <Version>2.0.0</Version>

From 8432ed7150717a312fa44d66a203172faa9c30a3 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Wed, 15 Nov 2023 23:51:54 -0600
Subject: [PATCH 59/61] comments, optimizations

---
 DotMP/GPU/AcceleratorHandler.tt | 45 +++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt
index e3582530..156d155a 100644
--- a/DotMP/GPU/AcceleratorHandler.tt
+++ b/DotMP/GPU/AcceleratorHandler.tt
@@ -52,12 +52,18 @@ namespace DotMP.GPU
         /// Kernel cache.
         /// </summary>
         private static Dictionary<string, Delegate> kernels = new Dictionary<string, Delegate>();
-
+        /// <summary>
+        /// Index cache for 1D kernels.
+        /// </summary>
         private static Dictionary<string, ValueTuple<int, int, Buffer<int>>> indices1d = new Dictionary<string, ValueTuple<int, int, Buffer<int>>>();
-
+        /// <summary>
+        /// Index cache for 2D kernels.
+        /// </summary>
         private static Dictionary<string, ValueTuple<int, int, int, int, Buffer<int>, Buffer<int>>> indices2d =
             new Dictionary<string, ValueTuple<int, int, int, int, Buffer<int>, Buffer<int>>>();
-
+        /// <summary>
+        /// Index cache for 3D kernels.
+        /// </summary>
         private static Dictionary<string, ValueTuple<ValueTuple<int, int>, ValueTuple<int, int>, ValueTuple<int, int>, Buffer<int>, Buffer<int>, Buffer<int>>> indices3d =
             new Dictionary<string, ValueTuple<ValueTuple<int, int>, ValueTuple<int, int>, ValueTuple<int, int>, Buffer<int>, Buffer<int>, Buffer<int>>>();
 
@@ -185,7 +191,7 @@ namespace DotMP.GPU
         /// </summary>
         /// <param name="range">The range of the for loop.</param>
         /// <param name="src">The calling location in the source code.</param>
-        /// <returns>A buffer representing the indices.</returns>
+        /// <returns>The calculated index.</returns>
         internal Index Get1DIdx((int, int) range, string src)
         {
             if (indices1d.ContainsKey(src))
@@ -207,6 +213,13 @@ namespace DotMP.GPU
             return new Index(buf);
         }
 
+        /// <summary>
+        /// Precomputes and caches the indices for a 2D for loop.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The inner range of the for loop.</param>
+        /// <param name="src">The calling location in the source code.</param>
+        /// <returns>A tuple of calculated indices.</returns>
         internal ValueTuple<Index, Index> Get2DIdx((int, int) range1, (int, int) range2, string src)
         {
             if (indices2d.ContainsKey(src))
@@ -247,6 +260,14 @@ namespace DotMP.GPU
             return (new Index(b1), new Index(b2));
         }
 
+        /// <summary>
+        /// Precomputes and caches the indices for a 3D for loop.
+        /// </summary>
+        /// <param name="range1">The outer range of the for loop.</param>
+        /// <param name="range2">The middle range of the for loop.</param>
+        /// <param name="range3">The inner range of the for loop.</param>
+        /// <param name="src">The calling location in the source code.</param>
+        /// <returns>A tuple of calculated indices.</returns>
         internal ValueTuple<Index, Index, Index> Get3DIdx((int, int) range1, (int, int) range2, (int, int) range3, string src)
         {
             if (indices3d.ContainsKey(src))
@@ -323,9 +344,13 @@ namespace DotMP.GPU
 
             var kernel = GetKernel(action, src);
 
+<# for (int i = 0; i < c; i++) { #>
+            var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>);
+<# } #>
+
             kernel((len / block_size, block_size), idx
 <# for (int i = 0; i < c; i++) { #>
-            , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+            , gpu<#= i + 1 #>
 <# } #>
             );
 
@@ -338,7 +363,7 @@ namespace DotMP.GPU
 
                 kernel((1, not_done), idx
 <# for (int i = 0; i < c; i++) { #>
-                , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+                , gpu<#= i + 1 #>
 <# } #>
                 );
             }
@@ -431,9 +456,13 @@ namespace DotMP.GPU
 
             var kernel = GetKernel(action, src);
 
+<# for (int i = 0; i < c; i++) { #>
+            var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>);
+<# } #>
+
             kernel((len / block_size, block_size), i, j, k
 <# for (int i = 0; i < c; i++) { #>
-            , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+            , gpu<#= i + 1 #>
 <# } #>
             );
 
@@ -448,7 +477,7 @@ namespace DotMP.GPU
 
                 kernel((1, not_done), i, j, k
 <# for (int i = 0; i < c; i++) { #>
-                , new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>)
+                , gpu<#= i + 1 #>
 <# } #>
                 );
             }

From 97571c53a5bfec68bca1a980ba2ee0fe01e5b26b Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Wed, 15 Nov 2023 23:52:18 -0600
Subject: [PATCH 60/61] temporary

---
 DotMP/GPU/GpuArray.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs
index 11efff12..ca4ad643 100644
--- a/DotMP/GPU/GpuArray.cs
+++ b/DotMP/GPU/GpuArray.cs
@@ -33,7 +33,7 @@ public struct GPUArray<T>
         /// <summary>
         /// The ILGPU view for 1D arrays.
         /// </summary>
-        private ArrayView1D<T, Stride1D.Dense> view1d;
+        //private ArrayView1D<T, Stride1D.Dense> view1d;
 
         /// <summary>
         /// The ILGPU view for 2D arrays.
@@ -43,7 +43,7 @@ public struct GPUArray<T>
         /// <summary>
         /// The ILGPU view for 3D arrays.
         /// </summary>
-        private ArrayView3D<T, Stride3D.DenseZY> view3d;
+        //private ArrayView3D<T, Stride3D.DenseZY> view3d;
 
         /// <summary>
         /// Number of dimensions.

From f87b6876c3532f54a477ac8628adf85e557766a1 Mon Sep 17 00:00:00 2001
From: Lane <iamaperson620@gmail.com>
Date: Wed, 15 Nov 2023 23:52:36 -0600
Subject: [PATCH 61/61] add optimizations

---
 DotMP/GPU/Index.cs | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs
index 159fee65..8de4dc3e 100644
--- a/DotMP/GPU/Index.cs
+++ b/DotMP/GPU/Index.cs
@@ -29,10 +29,23 @@ namespace DotMP.GPU
     [ExcludeFromCodeCoverage]
     public struct Index
     {
+        /// <summary>
+        /// Lookup table for indices.
+        /// </summary>
         private ArrayView1D<int, Stride1D.Dense> lookup;
+        /// <summary>
+        /// Offset for followup kernels.
+        /// </summary>
         private int offset;
+        /// <summary>
+        /// Cached index.
+        /// </summary>
         private int idx;
 
+        /// <summary>
+        /// Constructor.
+        /// </summary>
+        /// <param name="buf">Buffer representing the indices.</param>
         internal Index(Buffer<int> buf)
         {
             this.lookup = buf.View1D;
@@ -40,11 +53,19 @@ internal Index(Buffer<int> buf)
             idx = -1;
         }
 
+        /// <summary>
+        /// Adds an offset in preparation for a followup kernel.
+        /// </summary>
+        /// <param name="offset">The offset to set.</param>
         internal void AddOffset(int offset)
         {
             this.offset = offset;
         }
 
+        /// <summary>
+        /// Looks up the index and caches it for future use.
+        /// </summary>
+        /// <param name="i">The Index object to cast to int.</param>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static implicit operator int(Index i)
         {