diff --git a/.github/workflows/publish_site.yml b/.github/workflows/publish_site.yml index 21b8cc44..2bc3b185 100644 --- a/.github/workflows/publish_site.yml +++ b/.github/workflows/publish_site.yml @@ -12,6 +12,14 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Setup .NET + uses: actions/setup-dotnet@v3 + with: + dotnet-version: 8.0.x + + - name: Build Templates + run: make build + - name: Install Doxygen run: sudo apt-get install doxygen graphviz -y shell: bash @@ -28,4 +36,4 @@ jobs: uses: JamesIves/github-pages-deploy-action@v4 with: token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - folder: docs/html \ No newline at end of file + folder: docs/html diff --git a/.gitignore b/.gitignore index 25fb8c99..ac1e66dc 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ docs/* .vscode *.opencover.xml *.sln +AcceleratorHandler.cs +Gpu.cs ProcessedREADME.md # User-specific files diff --git a/DotMP-Tests/ParallelTests.cs b/DotMP-Tests/CPUTests.cs similarity index 97% rename from DotMP-Tests/ParallelTests.cs rename to DotMP-Tests/CPUTests.cs index 4f38d367..038ff4ba 100644 --- a/DotMP-Tests/ParallelTests.cs +++ b/DotMP-Tests/CPUTests.cs @@ -28,9 +28,9 @@ namespace DotMPTests { /// - /// Tests for the DotMP library. + /// CPU tests for the DotMP library. /// - public class ParallelTests + public class CPUTests { private readonly ITestOutputHelper output; @@ -38,7 +38,7 @@ public class ParallelTests /// Constructor to write output. /// /// Output object. - public ParallelTests(ITestOutputHelper output) + public CPUTests(ITestOutputHelper output) { this.output = output; } @@ -522,7 +522,7 @@ public void Critical_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { for (int i = 0; i < iters; i++) - DotMP.Parallel.Critical(0, () => ++total); + DotMP.Parallel.Critical(() => ++total); }); total.Should().Be((int)threads * iters); @@ -531,14 +531,13 @@ public void Critical_works() DotMP.Parallel.ParallelRegion(num_threads: 4, action: () => { - if (DotMP.Parallel.GetThreadNum() == 0) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 1) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 2) DotMP.Parallel.Critical(0, () => Thread.Sleep(1000)); - if (DotMP.Parallel.GetThreadNum() == 3) DotMP.Parallel.Critical(1, () => Thread.Sleep(1000)); + if (DotMP.Parallel.GetThreadNum() % 2 == 0) DotMP.Parallel.Critical(() => Thread.Sleep(1000)); + if (DotMP.Parallel.GetThreadNum() % 2 == 1) DotMP.Parallel.Critical(() => Thread.Sleep(1000)); }); double elapsed = DotMP.Parallel.GetWTime() - start; - elapsed.Should().BeLessThan(2200); + elapsed.Should().BeLessThan(2.2); + elapsed.Should().BeGreaterThan(2.0); } /// @@ -571,7 +570,7 @@ public void Single_works() { for (int i = 0; i < 10; i++) { - DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total)); + DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total)); } }); @@ -583,7 +582,7 @@ public void Single_works() { for (int i = 0; i < 10; i++) { - DotMP.Parallel.Single(0, () => DotMP.Atomic.Inc(ref total)); + DotMP.Parallel.Single(() => DotMP.Atomic.Inc(ref total)); } }); @@ -749,7 +748,7 @@ public void Ordered_works() DotMP.Parallel.ParallelFor(0, 1024, schedule: DotMP.Schedule.Static, num_threads: threads, action: i => { - DotMP.Parallel.Ordered(0, () => + DotMP.Parallel.Ordered(() => { incrementing[i] = ctr++; }); @@ -1111,7 +1110,7 @@ public void Tasking_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, 
() => + DotMP.Parallel.Single(() => { for (int i = 0; i < threads * 2; i++) { @@ -1139,7 +1138,7 @@ public void Tasking_works() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { for (int i = 0; i < tasks_to_spawn; i++) { @@ -1199,7 +1198,7 @@ public void Nested_tasks_work() DotMP.Parallel.ParallelRegion(num_threads: threads, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { DotMP.Parallel.Task(() => { @@ -1369,7 +1368,7 @@ public void Non_parallel_single_should_except() { Assert.Throws(() => { - DotMP.Parallel.Single(0, () => { }); + DotMP.Parallel.Single(() => { }); }); } @@ -1381,7 +1380,7 @@ public void Non_parallel_critical_should_except() { Assert.Throws(() => { - DotMP.Parallel.Critical(0, () => { }); + DotMP.Parallel.Critical(() => { }); }); } @@ -1395,7 +1394,7 @@ public void Nested_worksharing_should_except() { DotMP.Parallel.ParallelFor(0, 10, num_threads: 4, action: i => { - DotMP.Parallel.Single(0, () => { }); + DotMP.Parallel.Single(() => { }); }); }); @@ -1403,7 +1402,7 @@ public void Nested_worksharing_should_except() { DotMP.Parallel.ParallelRegion(num_threads: 4, action: () => { - DotMP.Parallel.Single(0, () => + DotMP.Parallel.Single(() => { DotMP.Parallel.For(0, 10, action: i => { }); }); @@ -1427,7 +1426,7 @@ public void Non_for_ordered_should_except() { Assert.Throws(() => { - DotMP.Parallel.Ordered(0, () => { }); + DotMP.Parallel.Ordered(() => { }); }); } diff --git a/DotMP-Tests/GPUTests.cs b/DotMP-Tests/GPUTests.cs new file mode 100644 index 00000000..7da2b446 --- /dev/null +++ b/DotMP-Tests/GPUTests.cs @@ -0,0 +1,136 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Text.Json.Serialization; +using System.Threading; +using DotMP; +using DotMP.GPU; +using FluentAssertions; +using Xunit; +using Xunit.Abstractions; + + +namespace DotMPTests +{ + /// + /// CPU tests for the DotMP library. + /// + public class GPUTests + { + /// + /// Tests to make sure that for loops work in GPU kernels. + /// + [Fact] + public void GPU_for_works() + { + double[] a = new double[50000]; + double[] x = new double[50000]; + double[] y = new double[50000]; + float[] res = new float[50000]; + float[] res_cpu = new float[50000]; + + random_init(a); + random_init(x); + random_init(y); + + { + using var a_gpu = new DotMP.GPU.Buffer(a, DotMP.GPU.Buffer.Behavior.To); + using var x_gpu = new DotMP.GPU.Buffer(x, DotMP.GPU.Buffer.Behavior.To); + using var y_gpu = new DotMP.GPU.Buffer(y, DotMP.GPU.Buffer.Behavior.To); + using var res_gpu = new DotMP.GPU.Buffer(res, DotMP.GPU.Buffer.Behavior.From); + + DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, x_gpu, y_gpu, res_gpu, + (i, a, x, y, res) => + { + res[i] = (float)(a[i] * x[i] + y[i]); + }); + } + + for (int i = 0; i < a.Length; i++) + { + res_cpu[i] = (float)(a[i] * x[i] + y[i]); + } + + Assert.Equal(res_cpu, res); + + double[] a_old = a.Select(a => a).ToArray(); + + using (var a_gpu = new DotMP.GPU.Buffer(a, DotMP.GPU.Buffer.Behavior.ToFrom)) + { + DotMP.GPU.Parallel.ParallelFor(0, a.Length, a_gpu, (i, a) => + { + a[i]++; + }); + } + + for (int i = 0; i < a.Length; i++) + { + a_old[i]++; + } + + Assert.Equal(a, a_old); + } + + /// + /// Tests to make sure that DotMP.GPU.Parallel.ForCollapse produces correct results. 
+ /// + [Fact] + public void Collapse_works() + { + int[,] iters_hit = new int[1024, 1024]; + + using (var buf = new Buffer(iters_hit, DotMP.GPU.Buffer.Behavior.ToFrom)) + { + DotMP.GPU.Parallel.ParallelForCollapse((258, 512), (512, 600), buf, (i, j, iters_hit) => + { + iters_hit[i, j]++; + }); + } + + for (int i = 0; i < 1024; i++) + for (int j = 0; j < 1024; j++) + if (i >= 258 && i < 512 && j >= 512 && j < 600) + iters_hit[i, j].Should().Be(1); + else + iters_hit[i, j].Should().Be(0); + + iters_hit = null; + + int[,,] iters_hit_3 = new int[128, 128, 64]; + + using (var buf = new Buffer(iters_hit_3, DotMP.GPU.Buffer.Behavior.ToFrom)) + { + DotMP.GPU.Parallel.ParallelForCollapse((35, 64), (16, 100), (10, 62), buf, action: (i, j, k, iters_hit_3) => + { + iters_hit_3[i, j, k]++; + }); + } + + for (int i = 0; i < 128; i++) + for (int j = 0; j < 128; j++) + for (int k = 0; k < 64; k++) + if (i >= 35 && i < 64 && j >= 16 && j < 100 && k >= 10 && k < 62) + iters_hit_3[i, j, k].Should().Be(1); + else + iters_hit_3[i, j, k].Should().Be(0); + + iters_hit_3 = null; + } + + /// + /// Randomly initialize an array of type T. + /// + /// The type to initialize to. + /// The allocated array to store values into. + private void random_init(T[] arr) + { + Random r = new Random(); + + for (int i = 0; i < arr.Length; i++) + { + arr[i] = (T)Convert.ChangeType(r.NextDouble() * 128, typeof(T)); + } + } + } +} diff --git a/DotMP/DotMP.csproj b/DotMP/DotMP.csproj index e8cc69f0..d045a50c 100644 --- a/DotMP/DotMP.csproj +++ b/DotMP/DotMP.csproj @@ -4,7 +4,7 @@ net6.0;net7.0;net8.0 DotMP DotMP - 1.6.0 + 2.0-pre1 Phillip Allen Lane,et al. A library for fork-join parallelism in .NET, with an OpenMP-like API. https://github.com/computablee/DotMP @@ -23,4 +23,21 @@ + + + + + + True + True + GPU/AcceleratorHandler.tt + + + + True + True + GPU/Gpu.tt + + + diff --git a/DotMP/GPU/AcceleratorHandler.tt b/DotMP/GPU/AcceleratorHandler.tt new file mode 100644 index 00000000..2527d1ba --- /dev/null +++ b/DotMP/GPU/AcceleratorHandler.tt @@ -0,0 +1,488 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +<#@ template debug="false" hostspecific="false" language="C#" #> +<#@ output extension=".cs" #> +<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' }; + int max = 13; #> + +using System; +using System.Collections.Generic; +using System.Linq; +using ILGPU; +using ILGPU.Runtime; + +namespace DotMP.GPU +{ + /// + /// The handler class managing GPU acceleration. + /// + internal sealed class AcceleratorHandler + { + /// + /// Determines if a GPU context has been initialized yet. + /// + private static bool initialized = false; + /// + /// The GPU context. 
+ /// + private static Context context; + /// + /// The accelerator object. + /// + internal static Accelerator accelerator; + /// + /// Block size to use for kernels. + /// + private static int block_size; + /// + /// Kernel cache. + /// + private static Dictionary kernels = new Dictionary(); + /// + /// Index cache for 1D kernels. + /// + private static Dictionary>> indices1d = new Dictionary>>(); + /// + /// Index cache for 2D kernels. + /// + private static Dictionary, Buffer>> indices2d = + new Dictionary, Buffer>>(); + /// + /// Index cache for 3D kernels. + /// + private static Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>> indices3d = + new Dictionary, ValueTuple, ValueTuple, Buffer, Buffer, Buffer>>(); + + /// + /// Default constructor. If this is the first time it's called, it initializes all relevant singleton data. + /// + internal AcceleratorHandler() + { + if (initialized) return; + + context = Context.Create() + .Optimize(OptimizationLevel.O2) + .Inlining(InliningMode.Aggressive) + .AllAccelerators() + //.Math(MathMode.Fast32BitOnly) + .ToContext(); + var selectedDevice = context.Devices[0]; + + foreach (var d in context.Devices) + { + Console.WriteLine("Detected {0} accelerator.", d.ToString()); + + if (selectedDevice.AcceleratorType == AcceleratorType.CPU && d.AcceleratorType == AcceleratorType.OpenCL) + selectedDevice = d; + if (selectedDevice.AcceleratorType != AcceleratorType.Cuda && d.AcceleratorType == AcceleratorType.Cuda) + selectedDevice = d; + } + + accelerator = selectedDevice.CreateAccelerator(context); + //accelerator = context.Devices[0].CreateAccelerator(context); + + Console.WriteLine("Using {0} accelerator.", accelerator.AcceleratorType.ToString()); + initialized = true; + block_size = accelerator.AcceleratorType == AcceleratorType.CPU ? 16 : 256; + } + + /// + /// Synchronize pending operations. + /// + private void Synchronize() => accelerator.Synchronize(); + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? 
"," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Get the kernel associated with this lambda. + /// + /// The action provided on the CPU. + /// The calling location. + /// The GPU kernel. + private Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > GetKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >(Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? ", " : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + if (!kernels.ContainsKey(src)) + kernels.Add(src, accelerator.LoadStreamKernel(action)); + + return (Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + >) kernels[src]; + } +<# } #> + + /// + /// Precomputes and caches the indices for a 1D for loop. + /// + /// The range of the for loop. + /// The calling location in the source code. + /// The calculated index. + internal Index Get1DIdx((int, int) range, string src) + { + if (indices1d.ContainsKey(src)) + { + var data = indices1d[src]; + if (data.Item1 == range.Item1 && data.Item2 == range.Item2) + return new Index(data.Item3); + else + data.Item3.Dispose(); + } + + int[] indices = new int[range.Item2 - range.Item1]; + + for (int i = 0; i < indices.Length; i++) + indices[i] = i + range.Item1; + + var buf = new Buffer(indices, Buffer.Behavior.To); + indices1d[src] = (range.Item1, range.Item2, buf); + return new Index(buf); + } + + /// + /// Precomputes and caches the indices for a 2D for loop. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. + /// The calling location in the source code. + /// A tuple of calculated indices. + internal ValueTuple Get2DIdx((int, int) range1, (int, int) range2, string src) + { + if (indices2d.ContainsKey(src)) + { + var data = indices2d[src]; + if (data.Item1 == range1.Item1 && data.Item2 == range1.Item2 && + data.Item3 == range2.Item1 && data.Item4 == range2.Item2) + return (new Index(data.Item5), new Index(data.Item6)); + else + { + data.Item5.Dispose(); + data.Item6.Dispose(); + } + } + + int[] indi = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1)]; + int[] indj = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1)]; + + int ci = range1.Item1, cj = range2.Item1; + + for (int i = 0; i < indi.Length; i++) + { + indi[i] = ci; + indj[i] = cj; + + if (++cj == range2.Item2) + { + cj = range2.Item1; + ++ci; + } + } + + var b1 = new Buffer(indi, Buffer.Behavior.To); + var b2 = new Buffer(indj, Buffer.Behavior.To); + indices2d[src] = (range1.Item1, range1.Item2, range2.Item1, range2.Item2, b1, b2); + + return (new Index(b1), new Index(b2)); + } + + /// + /// Precomputes and caches the indices for a 3D for loop. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. + /// The calling location in the source code. + /// A tuple of calculated indices. 
+ internal ValueTuple Get3DIdx((int, int) range1, (int, int) range2, (int, int) range3, string src) + { + if (indices3d.ContainsKey(src)) + { + var data = indices3d[src]; + if (data.Item1.Item1 == range1.Item1 && data.Item1.Item2 == range1.Item2 && + data.Item2.Item1 == range2.Item1 && data.Item2.Item2 == range2.Item2 && + data.Item3.Item1 == range3.Item1 && data.Item3.Item2 == range3.Item2) + return (new Index(data.Item4), new Index(data.Item5), new Index(data.Item6)); + else + { + data.Item4.Dispose(); + data.Item5.Dispose(); + data.Item6.Dispose(); + } + } + + int[] indi = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + int[] indj = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + int[] indk = new int[(range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1)]; + + int ci = range1.Item1, cj = range2.Item1, ck = range3.Item1; + + for (int i = 0; i < indi.Length; i++) + { + indi[i] = ci; + indj[i] = cj; + indk[i] = ck; + + if (++ck == range3.Item2) + { + ck = range3.Item1; + + if (++cj == range2.Item2) + { + cj = range2.Item1; + ++ci; + } + } + } + + var b1 = new Buffer(indi, Buffer.Behavior.To); + var b2 = new Buffer(indj, Buffer.Behavior.To); + var b3 = new Buffer(indk, Buffer.Behavior.To); + indices3d[src] = ((range1.Item1, range1.Item2), (range2.Item1, range2.Item2), (range3.Item1, range3.Item2), b1, b2, b3); + + return (new Index(b1), new Index(b2), new Index(b3)); + } + + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Dispatches a linear kernel with the given number of parameters. + /// + /// The range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var idx = Get1DIdx(range1, src); + var len = range1.Item2 - range1.Item1; + + var kernel = GetKernel(action, src); + +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + + kernel((len / block_size, block_size), idx +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + idx.AddOffset(offset); + + kernel((1, not_done), idx +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + } + + Synchronize(); + } +<# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Dispatches a 2D kernel with the given number of parameters. + /// + /// The outer range of the for loop. + /// The inner range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? 
"," : "") #> <# } #> + >((int, int) range1, (int, int) range2, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1); + (var i, var j) = Get2DIdx(range1, range2, src); + + var kernel = GetKernel(action, src); + +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + + kernel((len / block_size, block_size), i, j +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + i.AddOffset(offset); + j.AddOffset(offset); + + kernel((1, not_done), i, j +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + } + + Synchronize(); + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Dispatches a 3D kernel with the given number of parameters. + /// + /// The outer range of the for loop. + /// The middle range of the for loop. + /// The inner range of the for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The originating caller location. + internal void DispatchKernel< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, (int, int) range3, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, string src) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var len = (range1.Item2 - range1.Item1) * (range2.Item2 - range2.Item1) * (range3.Item2 - range3.Item1); + (var i, var j, var k) = Get3DIdx(range1, range2, range3, src); + + var kernel = GetKernel(action, src); + +<# for (int i = 0; i < c; i++) { #> + var gpu<#= i + 1 #> = new GPUArray<<#= letters[i] #>>(buf<#= i + 1 #>); +<# } #> + + kernel((len / block_size, block_size), i, j, k +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + + int not_done = len % block_size; + + if (not_done > 0) + { + int offset = len - not_done; + i.AddOffset(offset); + j.AddOffset(offset); + k.AddOffset(offset); + + kernel((1, not_done), i, j, k +<# for (int i = 0; i < c; i++) { #> + , gpu<#= i + 1 #> +<# } #> + ); + } + + Synchronize(); + } +<# } #> + } +} diff --git a/DotMP/GPU/AssemblyAttributes.cs b/DotMP/GPU/AssemblyAttributes.cs new file mode 100644 index 00000000..7077a588 --- /dev/null +++ b/DotMP/GPU/AssemblyAttributes.cs @@ -0,0 +1,19 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. 
+* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("ILGPURuntime")] \ No newline at end of file diff --git a/DotMP/GPU/Buffer.cs b/DotMP/GPU/Buffer.cs new file mode 100644 index 00000000..26832163 --- /dev/null +++ b/DotMP/GPU/Buffer.cs @@ -0,0 +1,226 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using System; +using System.Runtime.CompilerServices; +using ILGPU; +using ILGPU.Runtime; + +namespace DotMP.GPU +{ + namespace Buffer + { + /// + /// Specifies the behavior of the buffer. + /// + public enum Behavior + { + /// + /// Specifies that data should be transfered to the GPU, but not from it. + /// + To, + /// + /// Specifies that data should be transfered from the GPU, but not to it. + /// + From, + /// + /// Specifies that data should be transfered both to and from the GPU. + /// + ToFrom, + /// + /// Specifies that the data shouldn't be transfered to or from the GPU. For internal use. + /// + NoCopy + } + } + + /// + /// Buffer to manage GPU memory. Should only be created on the CPU. + /// + public sealed class Buffer : IDisposable + where T : unmanaged + { + /// + /// The ILGPU buffer for 1D arrays. + /// + private MemoryBuffer1D buf1d; + + /// + /// The ILGPU buffer for 2D arrays. + /// + private MemoryBuffer2D buf2d; + + /// + /// The ILGPU buffer for 3D arrays. + /// + private MemoryBuffer3D buf3d; + + /// + /// Behavior of the data, as specified by Behavior. + /// + private Buffer.Behavior behavior; + + /// + /// The CPU 1D array, so that we can copy the data back. + /// + private T[] data1d; + + /// + /// The CPU 2D array, so that we can copy the data back. + /// + private T[,] data2d; + + /// + /// The CPU 3D array, so that we can copy the data back. + /// + private T[,,] data3d; + + /// + /// The number of dimensions in the array. + /// + internal int Dimensions { get; private set; } + + /// + /// Constructor for buffer object. Allocates a 1D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. 
+ public Buffer(T[] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data1d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf1d = AcceleratorHandler.accelerator.Allocate1D(data); + break; + case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: + buf1d = AcceleratorHandler.accelerator.Allocate1D(data.Length); + break; + } + + Dimensions = 1; + } + + /// + /// Constructor for buffer object. Allocates a 2D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. + public Buffer(T[,] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data2d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY(data); + break; + case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: + buf2d = AcceleratorHandler.accelerator.Allocate2DDenseY((data.GetLength(0), data.GetLength(1))); + break; + } + + Dimensions = 2; + } + + /// + /// Constructor for buffer object. Allocates a 3D array on the GPU and makes it available for the next GPU kernel. + /// + /// The data to allocate on the GPU. + /// The behavior of the data, see Behavior. + public Buffer(T[,,] data, Buffer.Behavior behavior) + { + new AcceleratorHandler(); + + this.behavior = behavior; + this.data3d = data; + + switch (behavior) + { + case Buffer.Behavior.To: + case Buffer.Behavior.ToFrom: + buf3d = AcceleratorHandler.accelerator.Allocate3DDenseXY(data); + break; + case Buffer.Behavior.From: + case Buffer.Behavior.NoCopy: + buf3d = AcceleratorHandler.accelerator.Allocate3DDenseXY((data.GetLength(0), data.GetLength(1), data.GetLength(2))); + break; + } + + Dimensions = 3; + } + + /// + /// Dispose of the buffer, freeing GPU memory and copying any relevant data back to the CPU. + /// + public void Dispose() + { + if (Dimensions == 1) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + buf1d.GetAsArray1D().CopyTo(data1d, 0); + } + + buf1d.Dispose(); + } + else if (Dimensions == 2) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + System.Buffer.BlockCopy(buf2d.GetAsArray2D(), 0, data2d, 0, Unsafe.SizeOf() * data2d.Length); + } + + buf2d.Dispose(); + } + else if (Dimensions == 3) + { + if (behavior == Buffer.Behavior.From || behavior == Buffer.Behavior.ToFrom) + { + System.Buffer.BlockCopy(buf3d.GetAsArray3D(), 0, data3d, 0, Unsafe.SizeOf() * data3d.Length); + } + + buf3d.Dispose(); + } + } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView1D View1D { get => buf1d.View; } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView2D View2D { get => buf2d.View; } + + /// + /// Get the view of the memory for the GPU. + /// + internal ArrayView3D View3D { get => buf3d.View; } + } +} \ No newline at end of file diff --git a/DotMP/GPU/Exceptions.cs b/DotMP/GPU/Exceptions.cs new file mode 100644 index 00000000..4705041b --- /dev/null +++ b/DotMP/GPU/Exceptions.cs @@ -0,0 +1,21 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using System; + +namespace DotMP.GPU +{ +} \ No newline at end of file diff --git a/DotMP/GPU/Gpu.tt b/DotMP/GPU/Gpu.tt new file mode 100644 index 00000000..6cf2d841 --- /dev/null +++ b/DotMP/GPU/Gpu.tt @@ -0,0 +1,140 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +<#@ template debug="false" hostspecific="false" language="C#" #> +<#@ output extension=".cs" #> +<# var letters = new char[] { 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'A', 'B', 'C', 'D', 'E', 'F' }; + int max = 13; #> + +using System; +using System.Runtime.CompilerServices; + +namespace DotMP.GPU +{ + /// + /// The main class of DotMP's GPU API, powered by the ILGPU project. + /// Contains all the main methods for constructing and running GPU kernels. + /// The GPU API is not thread-safe at the current moment, so its methods should not be called from within a Parallel.ParallelRegion! + /// + public static class Parallel + { + /// + /// Formats the caller information for determining uniqueness of a call. + /// + /// The calling file. + /// The calling line number. + /// A formatted string representing "{filename}:{linenum}" + private static string FormatCaller(string filename, int linenum) + { + return string.Format("{0}:{1}", filename, linenum); + } + +<# for (int c = 1; c <= max; c++) { #> + /// + /// Creates a GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// + /// The start of the loop, inclusive. + /// The end of the loop, exclusive. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelFor< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? 
"," : "") #> <# } #> + >(int start, int end, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> where <#= letters[i] #> : unmanaged <# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel((start, end), +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + +<# for (int c = 1; c <= max - 1; c++) { #> + /// + /// Creates a collapsed GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// + /// The range of the outer for loop. + /// The range of the inner for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelForCollapse< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(range1, range2, +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + +<# for (int c = 1; c <= max - 2; c++) { #> + /// + /// Creates a collapsed GPU parallel for loop. + /// The body of the kernel is run on a GPU target. + /// + /// The range of the outer for loop. + /// The range of the middle for loop. + /// The range of the inner for loop. +<# for (int i = 0; i < c; i++) { #> + /// Buffer #<#= i + 1 #> to run the kernel with. +<# } #> + /// The kernel to run on the GPU. + /// The line number this method was called from. + /// The path to the file this method was called from. + public static void ParallelForCollapse< +<# for (int i = 0; i < c; i++) { #> <#= letters[i] + ((i != c - 1) ? "," : "") #> <# } #> + >((int, int) range1, (int, int) range2, (int, int) range3, +<# for (int i = 0; i < c; i++) { #> Buffer<<#= letters[i] #>> buf<#= i + 1 #>, <# } #> + Action GPUArray<<#= letters[i] #>><#= (i != c - 1) ? "," : "" #> <# } #> + > action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) +<# for (int i = 0; i < c; i++) { #> + where <#= letters[i] #> : unmanaged +<# } #> + { + var handler = new AcceleratorHandler(); + string src = FormatCaller(path, line); + handler.DispatchKernel(range1, range2, range3, +<# for (int i = 0; i < c; i++) { #> buf<#= i + 1 #>, <# } #> + action, src); + } +<# } #> + } +} diff --git a/DotMP/GPU/GpuArray.cs b/DotMP/GPU/GpuArray.cs new file mode 100644 index 00000000..036fe1a4 --- /dev/null +++ b/DotMP/GPU/GpuArray.cs @@ -0,0 +1,141 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. 
+* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using ILGPU; +using ILGPU.IR.Values; +using ILGPU.Runtime; +using System; +using System.Diagnostics.CodeAnalysis; + +namespace DotMP.GPU +{ + /// + /// Wrapper object for representing arrays on the GPU. + /// + /// + [ExcludeFromCodeCoverage] + public struct GPUArray + where T : unmanaged + { + /// + /// The ILGPU view for 1D arrays. + /// + private ArrayView1D view1d; + + /// + /// The ILGPU view for 2D arrays. + /// + private ArrayView2D view2d; + + /// + /// The ILGPU view for 3D arrays. + /// + private ArrayView3D view3d; + + /// + /// Number of dimensions. + /// + private int dims; + + /// + /// Constructor. + /// + /// The Buffer to create an array from. + internal GPUArray(Buffer buf) + { + switch (buf.Dimensions) + { + default: + case 1: + view1d = buf.View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; + break; + case 2: + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + view2d = buf.View2D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view3d = new Buffer(new T[1, 1, 1], Buffer.Behavior.NoCopy).View3D; + break; + case 3: + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view1d = new Buffer(new T[1], Buffer.Behavior.NoCopy).View1D; + // BAND-AID FIX: Cannot use empty ArrayViews on OpenCL devices. + view2d = new Buffer(new T[1, 1], Buffer.Behavior.NoCopy).View2D; + view3d = buf.View3D; + break; + } + + dims = buf.Dimensions; + } + + /// + /// Overload for [] operator. + /// + /// The ID to index into. + /// The data at that ID. + public ref T this[int idx] + { + get => ref view1d[idx]; + } + + /// + /// Overload for [,] operator. + /// + /// The first ID to index into. + /// The second ID to index into. + /// The data at that ID. + public ref T this[int i, int j] + { + get => ref view2d[i, j]; + } + + /// + /// Overload for [,,] operator. + /// + /// The first ID to index into. + /// The second ID to index into. + /// The third ID to index into. + /// The data at that ID. + public ref T this[int i, int j, int k] + { + get => ref view3d[i, j, k]; + } + + /// + /// Gets the length of the array. 
+ /// + public int Length + { + get + { + switch (dims) + { + case 1: + default: + return view1d.IntLength; + case 2: + return view2d.IntLength; + case 3: + return view3d.IntLength; + } + } + } + } +} diff --git a/DotMP/GPU/Index.cs b/DotMP/GPU/Index.cs new file mode 100644 index 00000000..8de4dc3e --- /dev/null +++ b/DotMP/GPU/Index.cs @@ -0,0 +1,78 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +*/ + +using ILGPU; +using ILGPU.Runtime; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Xml; + +namespace DotMP.GPU +{ + /// + /// Represents an index passed as the first index argument. + /// + [ExcludeFromCodeCoverage] + public struct Index + { + /// + /// Lookup table for indices. + /// + private ArrayView1D lookup; + /// + /// Offset for followup kernels. + /// + private int offset; + /// + /// Cached index. + /// + private int idx; + + /// + /// Constructor. + /// + /// Buffer representing the indices. + internal Index(Buffer buf) + { + this.lookup = buf.View1D; + offset = 0; + idx = -1; + } + + /// + /// Adds an offset in preperation for a followup kernel. + /// + /// The offset to set. + internal void AddOffset(int offset) + { + this.offset = offset; + } + + /// + /// Calculates the index and caches for future use. + /// + /// The Index object to cast to int. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static implicit operator int(Index i) + { + if (i.idx == -1) + i.idx = i.lookup[Grid.GlobalLinearIndex + i.offset]; + + return i.idx; + } + } +} diff --git a/DotMP/Parallel.cs b/DotMP/Parallel.cs index a7f4f8c9..a64ae99f 100644 --- a/DotMP/Parallel.cs +++ b/DotMP/Parallel.cs @@ -16,31 +16,34 @@ using System; using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.ComponentModel; using System.Threading; using DotMP.Exceptions; using DotMP.Schedulers; +using System.Diagnostics.CodeAnalysis; namespace DotMP { /// /// The main class of DotMP. /// Contains all the main methods for parallelism. - /// For users, this is the main class you want to worry about, along with Lock, Shared, and Atomic + /// For users, this is the main class you want to worry about, along with Lock, Shared, Atomic, and GPU. /// public static class Parallel { /// /// The dictionary for critical regions. /// - private static volatile Dictionary critical_lock = new Dictionary(); + private static volatile Dictionary critical_lock = new Dictionary(); /// /// The dictionary for single regions. /// - private static volatile HashSet single_thread = new HashSet(); + private static volatile HashSet single_thread = new HashSet(); /// /// The dictionary for ordered regions. 
/// - private static volatile Dictionary ordered = new Dictionary(); + private static volatile Dictionary ordered = new Dictionary(); /// /// Barrier object for DotMP.Parallel.Barrier() /// @@ -174,6 +177,17 @@ private static void ValidateParams(int start = 0, int end = 0, IScheduler schedu throw new InvalidArgumentsException(string.Format("Chunk size must be specified with user-defined schedulers, as it cannot be inferred.")); } + /// + /// Formats the caller information for determining uniqueness of a call. + /// + /// The calling file. + /// The calling line number. + /// A formatted string representing "{filename}:{linenum}" + private static string FormatCaller(string filename, int linenum) + { + return string.Format("{0}:{1}", filename, linenum); + } + /// /// Creates a for loop inside a parallel region. /// A for loop created with For inside of a parallel region is executed in parallel, with iterations being distributed among the threads, and potentially out-of-order. @@ -1096,11 +1110,16 @@ public static void ParallelSections(uint? num_threads = null, params Action[] ac /// Creates a critical region. /// A critical region is a region of code that can only be executed by one thread at a time. /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished. + /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the critical region. Must be unique per region but consistent across all threads. /// The action to be performed in the critical region. /// The ID of the critical region. /// Thrown when not in a parallel region. + [Obsolete("This version of Critical is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static int Critical(int id, Action action) { if (!InParallel()) @@ -1110,6 +1129,45 @@ public static int Critical(int id, Action action) object lock_obj; + lock (critical_lock) + { + if (!critical_lock.ContainsKey(id.ToString())) + { + critical_lock.Add(id.ToString(), new object()); + } + + lock_obj = critical_lock[id.ToString()]; + } + + lock (lock_obj) + { + action(); + } + + return id; + } + + /// + /// Creates a critical region. + /// A critical region is a region of code that can only be executed by one thread at a time. + /// If a thread encounters a critical region while another thread is inside a critical region, it will wait until the other thread is finished. + /// + /// The action to be performed in the critical region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// The ID of the critical region. + /// Thrown when not in a parallel region. + public static void Critical(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + + if (!InParallel()) + { + throw new NotInParallelRegionException("Cannot use DotMP Critical outside of a parallel region."); + } + + object lock_obj; + lock (critical_lock) { if (!critical_lock.ContainsKey(id)) @@ -1124,8 +1182,6 @@ public static int Critical(int id, Action action) { action(); } - - return id; } /// @@ -1180,11 +1236,16 @@ public static void Master(Action action) /// Creates a single region. /// A single region is only executed once per Parallel.ParallelRegion. /// The first thread to encounter the single region marks the region as encountered, then executes it. 
+ /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the single region. Must be unique per region but consistent across all threads. /// The action to be performed in the single region. /// Thrown when not in a parallel region. /// Thrown when nested inside another worksharing region. + [Obsolete("This version of Single is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static void Single(int id, Action action) { var freg = new ForkedRegion(); @@ -1204,6 +1265,55 @@ public static void Single(int id, Action action) Interlocked.Increment(ref freg.in_workshare); + lock (single_thread) + { + if (!single_thread.Contains(id.ToString())) + { + single_thread.Add(id.ToString()); + new_single = true; + } + } + + if (new_single) + { + action(); + } + + Interlocked.Decrement(ref freg.in_workshare); + + Barrier(); + } + + /// + /// Creates a single region. + /// A single region is only executed once per Parallel.ParallelRegion. + /// The first thread to encounter the single region marks the region as encountered, then executes it. + /// + /// The action to be performed in the single region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// Thrown when not in a parallel region. + /// Thrown when nested inside another worksharing region. + public static void Single(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + var freg = new ForkedRegion(); + bool new_single = false; + + if (!freg.in_parallel) + { + throw new NotInParallelRegionException("Cannot use DotMP Single outside of a parallel region."); + } + + var ws = new WorkShare(); + + if (ws.in_for) + { + throw new CannotPerformNestedWorksharingException("Cannot use DotMP Single nested within other worksharing constructs."); + } + + Interlocked.Increment(ref freg.in_workshare); + lock (single_thread) { if (!single_thread.Contains(id)) @@ -1227,10 +1337,15 @@ public static void Single(int id, Action action) /// Creates an ordered region. /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction<T>() loop. /// This also acts as an implicit Critical() region. + /// + /// THIS METHOD IS NOW DEPRECATED. /// /// The ID of the ordered region. Must be unique per region but consistent across all threads. /// The action to be performed in the ordered region. /// Thrown when not in a parallel region. + [Obsolete("This version of Ordered is deprecated. Omit the id parameter for the updated version. This overload will be removed in a future release.")] + [EditorBrowsable(EditorBrowsableState.Never)] + [ExcludeFromCodeCoverage] public static void Ordered(int id, Action action) { var freg = new ForkedRegion(); @@ -1240,6 +1355,46 @@ public static void Ordered(int id, Action action) throw new NotInParallelRegionException("Cannot use DotMP Ordered outside of a parallel region."); } + lock (ordered) + { + if (!ordered.ContainsKey(id.ToString())) + { + ordered.Add(id.ToString(), 0); + } + Thread.MemoryBarrier(); + } + + WorkShare ws = new WorkShare(); + + while (ordered[id.ToString()] != ws.working_iter) ; + + action(); + + lock (ordered) + { + ordered[id.ToString()]++; + } + } + + /// + /// Creates an ordered region. + /// An ordered region is a region of code that is executed in order inside of a For() or ForReduction<T>() loop. 
+ /// This also acts as an implicit Critical() region. + /// + /// The action to be performed in the ordered region. + /// The line number this method was called from. + /// The path to the file this method was called from. + /// Thrown when not in a parallel region. + public static void Ordered(Action action, [CallerFilePath] string path = "", [CallerLineNumber] int line = 0) + { + string id = FormatCaller(path, line); + var freg = new ForkedRegion(); + + if (!freg.in_parallel) + { + throw new NotInParallelRegionException("Cannot use DotMP Ordered outside of a parallel region."); + } + lock (ordered) { if (!ordered.ContainsKey(id)) @@ -1270,7 +1425,7 @@ public static int GetNumThreads() { var freg = new ForkedRegion(); - return (freg.reg is not null) + return freg.in_parallel ? (int)freg.reg.num_threads : 1; } diff --git a/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj new file mode 100644 index 00000000..9cf0a6f0 --- /dev/null +++ b/benchmarks/GPUHeatTransfer/GPUHeatTransfer.csproj @@ -0,0 +1,18 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + + + + + diff --git a/benchmarks/GPUHeatTransfer/Program.cs b/benchmarks/GPUHeatTransfer/Program.cs new file mode 100644 index 00000000..75d0747f --- /dev/null +++ b/benchmarks/GPUHeatTransfer/Program.cs @@ -0,0 +1,307 @@ +/* +* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API. +* Copyright (C) 2023 Phillip Allen Lane +* +* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser +* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or +* (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +* License for more details. +* +* You should have received a copy of the GNU Lesser General Public License along with this library; if not, +* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+*/ + +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Jobs; +using BenchmarkDotNet.Running; +using BenchmarkDotNet.Diagnosers; + +/* jscpd:ignore-start */ + +[SimpleJob(RuntimeMoniker.Net60)] +[ThreadingDiagnoser] +[HardwareCounters] +[EventPipeProfiler(EventPipeProfile.CpuSampling)] +// test heat transfer using Parallel.For +public class HeatTransfer +{ + // scratch array + private double[,] scratch = new double[0, 0]; + // grid array + private double[,] grid = new double[0, 0]; + + // parallel type enum + public enum ParType { DMPFor, DMPGPU } + + // test dims of 100x100, 1000x1000, and 5000x5000 + [Params(768)] + public int dim; + + // test with 10 steps and 100 steps + [Params(100)] + public int steps; + + // test with all 3 parallel types + [Params(ParType.DMPFor, ParType.DMPGPU)] + public ParType type; + + // change this to configure the number of threads to use + public uint num_threads = 6; + + // buffer for grid + private DotMP.GPU.Buffer gridbuf; + + // buffer for scratch + private DotMP.GPU.Buffer scratchbuf; + + // run the setup + [GlobalSetup] + public void Setup() + { + scratch = new double[dim, dim]; + grid = new double[dim, dim]; + + for (int i = 0; i < dim; i++) + { + grid[0, i] = 100.0; + grid[i, 0] = 100.0; + grid[dim - 1, i] = 100.0; + grid[i, dim - 1] = 100.0; + } + + if (type == ParType.DMPGPU) + { + gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.ToFrom); + scratchbuf = new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); + } + } + + //run the simulation + [Benchmark] + public void DoSimulation() + { + Action action = () => + { + //do the steps + for (int i = 0; i < steps; i++) + { + DoStep(); + } + }; + + if (type == ParType.DMPGPU) + { + action(); + //gridbuf.Dispose(); + //scratchbuf.Dispose(); + } + else + { + // spawn a parallel region + DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action); + } + } + + //do a step of the heat transfer simulation + public void DoStep() + { + switch (type) + { + case ParType.DMPFor: + //iterate over all cells not on the border + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + }); + + //copy the scratch array to the grid array + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + }); + break; + + case ParType.DMPGPU: + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + }); + + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + grid[i, j] = scratch[i, j]; + }); + break; + } + } +} + +// test heat transfer using Parallel.For +public class HeatTransferVerify +{ + // scratch array + private double[,] scratch = new double[0, 0]; + // grid array + private double[,] grid = new double[0, 0]; + + // parallel type enum + public enum ParType { DMPFor, DMPGPU } + + // test dims of 100x100, 1000x1000, and 5000x5000 + public int dim = 1000; + + // test with 10 steps and 100 steps + public int steps = 100; + + // test with all 3 parallel types + public ParType 
type = ParType.DMPFor; + + // change this to configure the number of threads to use + public uint num_threads = 6; + + // buffer for grid + private DotMP.GPU.Buffer gridbuf; + + // buffer for scratch + private DotMP.GPU.Buffer scratchbuf; + + // run the setup + public void Setup() + { + scratch = new double[dim, dim]; + grid = new double[dim, dim]; + + for (int i = 0; i < dim; i++) + { + grid[0, i] = 100.0; + grid[i, 0] = 100.0; + grid[dim - 1, i] = 100.0; + grid[i, dim - 1] = 100.0; + } + + if (type == ParType.DMPGPU) + { + gridbuf = new DotMP.GPU.Buffer(grid, DotMP.GPU.Buffer.Behavior.ToFrom); + scratchbuf = new DotMP.GPU.Buffer(scratch, DotMP.GPU.Buffer.Behavior.NoCopy); + } + } + + //run the simulation + public void DoSimulation() + { + Action action = () => + { + //do the steps + for (int i = 0; i < steps; i++) + { + DoStep(); + } + }; + + if (type == ParType.DMPGPU) + { + action(); + gridbuf.Dispose(); + scratchbuf.Dispose(); + } + else + { + // spawn a parallel region + DotMP.Parallel.ParallelRegion(num_threads: num_threads, action: action); + } + } + + //do a step of the heat transfer simulation + public void DoStep() + { + switch (type) + { + case ParType.DMPFor: + //iterate over all cells not on the border + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + } + }); + + //copy the scratch array to the grid array + DotMP.Parallel.For(1, dim - 1, schedule: DotMP.Schedule.Guided, action: i => + { + for (int j = 1; j < dim - 1; j++) + { + grid[i, j] = scratch[i, j]; + } + }); + break; + + case ParType.DMPGPU: + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + //set the scratch array to the average of the surrounding cells + scratch[i, j] = 0.25 * (grid[i - 1, j] + grid[i + 1, j] + grid[i, j - 1] + grid[i, j + 1]); + }); + + DotMP.GPU.Parallel.ParallelForCollapse((1, dim - 1), (1, dim - 1), gridbuf, scratchbuf, (i, j, grid, scratch) => + { + grid[i, j] = scratch[i, j]; + }); + break; + } + } + + public void Verify() + { + type = ParType.DMPFor; + Setup(); + DoSimulation(); + double[,] gridA = grid; + + type = ParType.DMPGPU; + Setup(); + DoSimulation(); + double[,] gridB = grid; + + bool wrong = false; + + for (int i = 0; i < dim; i++) + for (int j = 0; j < dim; j++) + if (gridA[i, j] != gridB[i, j]) + { + wrong = true; + Console.WriteLine("Wrong at ({0}, {1}), expected {2}, got {3}.", i, j, gridA[i, j], gridB[i, j]); + } + + if (wrong) + Console.WriteLine("WRONG RESULT"); + else + Console.WriteLine("RIGHT RESULT"); + } +} + +/* jscpd:ignore-end */ + +// driver +public class Program +{ + public static void Main(string[] args) + { + if (args.Length > 0 && args[0] == "verify") + new HeatTransferVerify().Verify(); + else + BenchmarkRunner.Run(); + } +} diff --git a/benchmarks/GPUOverhead/GPUOverhead.csproj b/benchmarks/GPUOverhead/GPUOverhead.csproj new file mode 100644 index 00000000..9cf0a6f0 --- /dev/null +++ b/benchmarks/GPUOverhead/GPUOverhead.csproj @@ -0,0 +1,18 @@ + + + + Exe + net6.0 + enable + enable + + + + + + + + + + + diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs new file mode 100644 index 00000000..9c0dde2c --- /dev/null +++ b/benchmarks/GPUOverhead/Program.cs @@ -0,0 +1,56 @@ +/* +* DotMP - A collection of powerful abstractions for 
diff --git a/benchmarks/GPUOverhead/Program.cs b/benchmarks/GPUOverhead/Program.cs
new file mode 100644
index 00000000..9c0dde2c
--- /dev/null
+++ b/benchmarks/GPUOverhead/Program.cs
@@ -0,0 +1,56 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API.
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Diagnosers;
+
+/* jscpd:ignore-start */
+
+[SimpleJob(RuntimeMoniker.Net60)]
+[ThreadingDiagnoser]
+[HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
+public class Overhead
+{
+    DotMP.GPU.Buffer<int> buf;
+
+    // run the setup
+    [GlobalSetup]
+    public void Setup()
+    {
+        buf = new DotMP.GPU.Buffer<int>(new int[1, 1], DotMP.GPU.Buffer.Behavior.NoCopy);
+    }
+
+    // measure the cost of launching an empty DotMP.GPU kernel
+    [Benchmark]
+    public void TestOverhead()
+    {
+        DotMP.GPU.Parallel.ParallelForCollapse((0, 500), (0, 500), buf, (i, j, buf) => { });
+    }
+}
+
+/* jscpd:ignore-end */
+
+// driver
+public class Program
+{
+    public static void Main(string[] args)
+    {
+        BenchmarkRunner.Run<Overhead>();
+    }
+}
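+
+// For illustration, a hedged sketch of the same ParallelForCollapse call with a non-empty body.
+// The OverheadSketch class and its Run method are hypothetical and not part of the benchmark;
+// the sketch assumes, as the heat-transfer verify driver does, that a Behavior.ToFrom buffer
+// copies its contents back to the host array when it is disposed.
+public static class OverheadSketch
+{
+    public static void Run()
+    {
+        double[,] data = new double[500, 500];
+        var buf = new DotMP.GPU.Buffer<double>(data, DotMP.GPU.Buffer.Behavior.ToFrom);
+
+        // fill every element of the array on the device
+        DotMP.GPU.Parallel.ParallelForCollapse((0, 500), (0, 500), buf, (i, j, arr) =>
+        {
+            arr[i, j] = 1.0;
+        });
+
+        // release the buffer so the device results land back in `data`
+        buf.Dispose();
+    }
+}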
diff --git a/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj
new file mode 100644
index 00000000..9cf0a6f0
--- /dev/null
+++ b/benchmarks/ILGPUOverhead/ILGPUOverhead.csproj
@@ -0,0 +1,18 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net6.0</TargetFramework>
+    <ImplicitUsings>enable</ImplicitUsings>
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+
+
+
+
+
+
+
+
+</Project>
diff --git a/benchmarks/ILGPUOverhead/Program.cs b/benchmarks/ILGPUOverhead/Program.cs
new file mode 100644
index 00000000..6153183c
--- /dev/null
+++ b/benchmarks/ILGPUOverhead/Program.cs
@@ -0,0 +1,63 @@
+/*
+* DotMP - A collection of powerful abstractions for parallel programming in .NET with an OpenMP-like API.
+* Copyright (C) 2023 Phillip Allen Lane
+*
+* This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser
+* General Public License as published by the Free Software Foundation; either version 2.1 of the License, or
+* (at your option) any later version.
+*
+* This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+* License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License along with this library; if not,
+* write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+*/
+
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using BenchmarkDotNet.Running;
+using BenchmarkDotNet.Diagnosers;
+using System;
+using ILGPU;
+using ILGPU.Runtime;
+
+/* jscpd:ignore-start */
+
+[SimpleJob(RuntimeMoniker.Net60)]
+[ThreadingDiagnoser]
+[HardwareCounters]
+[EventPipeProfiler(EventPipeProfile.CpuSampling)]
+public class Overhead
+{
+    Action<KernelConfig, ArrayView1D<int, Stride1D.Dense>> kernel;
+    ArrayView1D<int, Stride1D.Dense> data;
+
+    // run the setup
+    [GlobalSetup]
+    public void Setup()
+    {
+        var context = Context.CreateDefault();
+        var accelerator = context.Devices[1].CreateAccelerator(context); // assumes a second enumerated device is present
+        kernel = accelerator.LoadStreamKernel<ArrayView1D<int, Stride1D.Dense>>(arr => { });
+        data = accelerator.Allocate1D<int>(1);
+    }
+
+    // measure the cost of launching an empty ILGPU kernel
+    [Benchmark]
+    public void TestOverhead()
+    {
+        kernel((1, 256), data);
+    }
+}
+
+/* jscpd:ignore-end */
+
+// driver
+public class Program
+{
+    public static void Main(string[] args)
+    {
+        BenchmarkRunner.Run<Overhead>();
+    }
+}
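+
+// For context, a minimal self-contained sketch of the raw ILGPU launch pattern the benchmark
+// above times. The ILGPULaunchSketch class is a hypothetical, illustrative helper and is not
+// part of the benchmark; note that ILGPU kernel launches are asynchronous, so Synchronize()
+// is what actually waits for the device work to finish.
+public static class ILGPULaunchSketch
+{
+    public static void Run()
+    {
+        using var context = Context.CreateDefault();
+        // prefer a GPU device and fall back to the CPU accelerator if none is present
+        using var accelerator = context.GetPreferredDevice(preferCPU: false).CreateAccelerator(context);
+
+        var kernel = accelerator.LoadStreamKernel<ArrayView1D<int, Stride1D.Dense>>(arr => { });
+        using var data = accelerator.Allocate1D<int>(1);
+
+        kernel((1, 256), data.View);   // enqueue the empty kernel
+        accelerator.Synchronize();     // block until it has finished
+    }
+}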