From 2a8d17c3051552f8deec557b129f95f6d26231b1 Mon Sep 17 00:00:00 2001 From: Tianjiao Sun Date: Thu, 11 Apr 2019 16:36:14 +0100 Subject: [PATCH 001/100] codegen: Implement SIMD vectorisation Only works when kernel is a Loopy kernel. --- pyop2/codegen/rep2loopy.py | 11 ++++-- pyop2/configuration.py | 2 + pyop2/sequential.py | 79 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 85 insertions(+), 7 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 9d4cf5837..c0597bfc5 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -247,13 +247,14 @@ def solve_fn_lookup(target, identifier): class _PreambleGen(ImmutableRecord): - fields = set(("preamble", )) + fields = {"preamble", "idx"} - def __init__(self, preamble): + def __init__(self, preamble, idx="0"): self.preamble = preamble + self.idx = idx def __call__(self, preamble_info): - yield ("0", self.preamble) + yield (self.idx, self.preamble) class PyOP2KernelCallable(loopy.ScalarCallable): @@ -610,7 +611,9 @@ def generate(builder, wrapper_name=None): options=options, assumptions=assumptions, lang_version=(2018, 2), - name=wrapper_name) + name=wrapper_name, + # TODO, should these really be silenced? + silenced_warnings=["write_race*"]) # prioritize loops for indices in context.index_ordering: diff --git a/pyop2/configuration.py b/pyop2/configuration.py index c5259340e..60edf173d 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -76,6 +76,8 @@ class Configuration(dict): DEFAULTS = { "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"), "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), + "alignment": ("PYOP2_ALIGNMENT", int, 64), + "time": ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), "cflags": ("PYOP2_CFLAGS", str, ""), "ldflags": ("PYOP2_LDFLAGS", str, ""), diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 1dbab1c18..3a166660e 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -37,6 +37,8 @@ from copy import deepcopy as dcopy import ctypes +import loopy +import numpy from pyop2.datatypes import IntType, as_ctypes from pyop2 import base @@ -57,8 +59,48 @@ from pyop2.mpi import collective from pyop2.profiling import timed_region from pyop2.utils import cached_property, get_petsc_dir +from pyop2.configuration import configuration +from pyop2.codegen.rep2loopy import _PreambleGen -import loopy + +def vectorise(wrapper, iname, batch_size): + """Return a vectorised version of wrapper, vectorising over iname. + + :arg wrapper: A loopy kernel to vectorise. + :arg iname: The iteration index to vectorise over. + :arg batch_size: The vector width.""" + if batch_size == 1: + return wrapper + + # create constant zero vectors + wrapper = wrapper.copy(target=loopy.CVecTarget()) + kernel = wrapper.root_kernel + zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True, + initializer=numpy.array(0.0, dtype=numpy.float64), + address_space=loopy.AddressSpace.GLOBAL, zero_size=batch_size) + tmps = kernel.temporary_variables.copy() + tmps["_zeros"] = zeros + kernel = kernel.copy(temporary_variables=tmps) + + # split iname and vectorize the inner loop + inner_iname = iname + "_batch" + + # vectorize using vector extenstions + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + + alignment = configuration["alignment"] + tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) + kernel = kernel.copy(temporary_variables=tmps) + + wrapper = wrapper.with_root_kernel(kernel) + + # vector data type + vec_types = [("double", 8), ("int", 4)] # scalar type, bytes + preamble = ["typedef {0} {0}{1} __attribute__ ((vector_size ({2})));".format(t, batch_size, batch_size * b) for t, b in vec_types] + preamble = "\n" + "\n".join(preamble) + + wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble, idx="01")]) + return wrapper class JITModule(base.JITModule): @@ -122,6 +164,15 @@ def code_to_compile(self): builder.add_argument(arg) wrapper = generate(builder) + if self._iterset._extruded: + iname = "layer" + else: + iname = "n" + has_matrix = any(arg._is_mat for arg in self._args) + has_rw = any(arg.access == RW for arg in self._args) + if isinstance(self._kernel.code, loopy.LoopKernel) and not (has_matrix or has_rw): + wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name) + wrapper = vectorise(wrapper, iname, configuration["simd_width"]) code = loopy.generate_code_v2(wrapper) if self._kernel._cpp: @@ -137,8 +188,6 @@ def compile(self): if not hasattr(self, '_args'): raise RuntimeError("JITModule has no args associated with it, should never happen") - from pyop2.configuration import configuration - compiler = configuration["compiler"] extension = "cpp" if self._kernel._cpp else "c" cppargs = self._cppargs @@ -184,6 +233,24 @@ def argtypes(self): class ParLoop(petsc_base.ParLoop): + def set_nbytes(self, args): + nbytes = 0 + seen = set() + for arg in args: + if arg.access is INC: + nbytes += arg.data.nbytes + else: + nbytes += arg.data.nbytes + for map_ in arg.map_tuple: + if map_ is None: + continue + for k in map_._kernel_args_: + if k in seen: + continue + nbytes += map_.values.nbytes + seen.add(k) + self.nbytes = nbytes + def prepare_arglist(self, iterset, *args): arglist = iterset._kernel_args_ for arg in args: @@ -199,6 +266,8 @@ def prepare_arglist(self, iterset, *args): continue arglist += (k,) seen.add(k) + if configuration["time"]: + self.set_nbytes(args) return arglist @cached_property @@ -213,6 +282,10 @@ def _compute_event(self): @collective def _compute(self, part, fun, *arglist): + if configuration["time"]: + nbytes = self.comm.allreduce(self.nbytes) + if self.comm.Get_rank() == 0: + print("{0}_BYTES= {1}".format(self._jitmodule._wrapper_name, nbytes)) with self._compute_event: self.log_flops(part.size * self.num_flops) fun(part.offset, part.offset + part.size, *arglist) From fbc6e4aa86e3e5b880f9854158ffb166c468b145 Mon Sep 17 00:00:00 2001 From: tj sun Date: Thu, 1 Aug 2019 17:55:28 +0100 Subject: [PATCH 002/100] add omp simd vectorization mode --- pyop2/configuration.py | 1 + pyop2/sequential.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 60edf173d..9d2076bb6 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -76,6 +76,7 @@ class Configuration(dict): DEFAULTS = { "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"), "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), + "vectorization_strategy":("PYOP2_VECT_STRATEGY", str, "ve"), "alignment": ("PYOP2_ALIGNMENT", int, 64), "time": ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 3a166660e..cc8d78d55 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -85,8 +85,13 @@ def vectorise(wrapper, iname, batch_size): # split iname and vectorize the inner loop inner_iname = iname + "_batch" - # vectorize using vector extenstions - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + if configuration["vectorization_strategy"] == "ve": + # vectorize using vector extenstions + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + else: + # vectoriza using omp pragma simd + assert configuration["vectorization_strategy"] == "omp" + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="omp_simd", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) From 5ae780da4444d78ffa19558be93ddd20903f8e46 Mon Sep 17 00:00:00 2001 From: tj sun Date: Sun, 4 Aug 2019 19:46:27 +0100 Subject: [PATCH 003/100] add openmp flag and by pass workaround flag --- pyop2/codegen/rep2loopy.py | 9 +++++++++ pyop2/compilation.py | 4 ++-- pyop2/sequential.py | 9 ++++++--- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index c0597bfc5..2fa117f9c 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -614,6 +614,15 @@ def generate(builder, wrapper_name=None): name=wrapper_name, # TODO, should these really be silenced? silenced_warnings=["write_race*"]) + from pyop2.configuration import configuration + if configuration["time"]: + batch_size = configuration["simd_width"] + if builder.extruded: + start, end = parameters.layer_start, parameters.layer_end + else: + start, end = "start", "end" + wrapper = loopy.assume(wrapper, "{0} mod {1} = 0".format(end, batch_size)) + wrapper = loopy.assume(wrapper, "exists zz: zz > 0 and {0} = {1}*zz + {2}".format(end, configuration["simd_width"], start)) # prioritize loops for indices in context.index_ordering: diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 01b1d279a..0f9f79b92 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -216,7 +216,7 @@ def workaround_cflags(self): if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"): # GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055 # See also https://github.com/firedrakeproject/firedrake/issues/1442 - # And https://github.com/firedrakeproject/firedrake/issues/1717 + return # enable vectorization for paper # Bug also on skylake with the vectoriser in this # combination (disappears without # -fno-tree-loop-vectorize!) @@ -394,7 +394,7 @@ class LinuxCompiler(Compiler): :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to COMM_WORLD).""" def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-march=native', '-O3', '-ffast-math'] + opt_flags = ['-O3', '-ffast-math', '-fopenmp'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" diff --git a/pyop2/sequential.py b/pyop2/sequential.py index cc8d78d55..bebe7a2c0 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -83,15 +83,18 @@ def vectorise(wrapper, iname, batch_size): kernel = kernel.copy(temporary_variables=tmps) # split iname and vectorize the inner loop + slabs = (1, 1) + if configuration["time"]: + slabs = (0, 0) inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": # vectorize using vector extenstions - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="c_vec", inner_iname=inner_iname) + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname) else: # vectoriza using omp pragma simd assert configuration["vectorization_strategy"] == "omp" - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=(0, 1), inner_tag="omp_simd", inner_iname=inner_iname) + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="omp_simd", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) @@ -243,7 +246,7 @@ def set_nbytes(self, args): seen = set() for arg in args: if arg.access is INC: - nbytes += arg.data.nbytes + nbytes += arg.data.nbytes * 2 else: nbytes += arg.data.nbytes for map_ in arg.map_tuple: From ba693dc1b4ba6addfa2f0693c0473b06677d3d92 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Thu, 11 Apr 2019 17:09:25 +0100 Subject: [PATCH 004/100] DROP BEFORE MERGE: test with correct loopy branch --- requirements-git.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-git.txt b/requirements-git.txt index 718e27330..4790f7f1b 100644 --- a/requirements-git.txt +++ b/requirements-git.txt @@ -1,4 +1,4 @@ git+https://github.com/firedrakeproject/petsc.git@firedrake#egg=petsc --no-deps git+https://github.com/firedrakeproject/petsc4py.git@firedrake#egg=petsc4py git+https://github.com/coneoproject/COFFEE.git#egg=coffee -git+https://github.com/firedrakeproject/loopy.git@firedrake#egg=loopy +git+https://github.com/firedrakeproject/loopy.git@cvec#egg=loopy From 4ec0769a9c5c65d626e9f80fc2df7abfe64a117a Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 09:57:13 +0100 Subject: [PATCH 005/100] Turn of tree vectorize for certain gcc compilers. We might not need the tree vectorisation flag for our vectorisation anyways. --- pyop2/compilation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 0f9f79b92..5b517fc44 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -216,7 +216,6 @@ def workaround_cflags(self): if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"): # GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055 # See also https://github.com/firedrakeproject/firedrake/issues/1442 - return # enable vectorization for paper # Bug also on skylake with the vectoriser in this # combination (disappears without # -fno-tree-loop-vectorize!) From f9e60fdb6a1608535c39884f73bd8bb8f5efcc84 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 10:01:26 +0100 Subject: [PATCH 006/100] Add simd compiler flags. --- pyop2/compilation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 5b517fc44..e8d8a921d 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -365,7 +365,7 @@ class MacCompiler(Compiler): """ def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-march=native', '-O3', '-ffast-math'] + opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" @@ -393,7 +393,7 @@ class LinuxCompiler(Compiler): :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to COMM_WORLD).""" def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-O3', '-ffast-math', '-fopenmp'] + opt_flags = ['-O3', '-ffast-math', '-fopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" @@ -419,7 +419,7 @@ class LinuxIntelCompiler(Compiler): rank 0 compiles code) (defaults to COMM_WORLD). """ def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-Ofast', '-xHost'] + opt_flags = ['-Ofast', '-xHost', '-qopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" From 00e073d1d0780b462ad0002a8f27610296a618ec Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 10:02:37 +0100 Subject: [PATCH 007/100] Remove time configuration. --- pyop2/codegen/rep2loopy.py | 9 --------- pyop2/sequential.py | 8 -------- 2 files changed, 17 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 2fa117f9c..c0597bfc5 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -614,15 +614,6 @@ def generate(builder, wrapper_name=None): name=wrapper_name, # TODO, should these really be silenced? silenced_warnings=["write_race*"]) - from pyop2.configuration import configuration - if configuration["time"]: - batch_size = configuration["simd_width"] - if builder.extruded: - start, end = parameters.layer_start, parameters.layer_end - else: - start, end = "start", "end" - wrapper = loopy.assume(wrapper, "{0} mod {1} = 0".format(end, batch_size)) - wrapper = loopy.assume(wrapper, "exists zz: zz > 0 and {0} = {1}*zz + {2}".format(end, configuration["simd_width"], start)) # prioritize loops for indices in context.index_ordering: diff --git a/pyop2/sequential.py b/pyop2/sequential.py index bebe7a2c0..98a2c5abe 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -84,8 +84,6 @@ def vectorise(wrapper, iname, batch_size): # split iname and vectorize the inner loop slabs = (1, 1) - if configuration["time"]: - slabs = (0, 0) inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": @@ -274,8 +272,6 @@ def prepare_arglist(self, iterset, *args): continue arglist += (k,) seen.add(k) - if configuration["time"]: - self.set_nbytes(args) return arglist @cached_property @@ -290,10 +286,6 @@ def _compute_event(self): @collective def _compute(self, part, fun, *arglist): - if configuration["time"]: - nbytes = self.comm.allreduce(self.nbytes) - if self.comm.Get_rank() == 0: - print("{0}_BYTES= {1}".format(self._jitmodule._wrapper_name, nbytes)) with self._compute_event: self.log_flops(part.size * self.num_flops) fun(part.offset, part.offset + part.size, *arglist) From 1cf7698593e77db83859f229183540ea931227c0 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 1 Jul 2020 13:02:23 +0100 Subject: [PATCH 008/100] Default SIMD width. --- pyop2/configuration.py | 20 ++++++++++++++++++-- requirements-ext.txt | 1 + 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 9d2076bb6..06db28b71 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -39,6 +39,22 @@ from pyop2.exceptions import ConfigurationError +def default_simd_width(): + from cpuinfo import get_cpu_info + avx_to_width = {'avx': 2, 'avx1': 2, 'avx128': 2, 'avx2': 4, + 'avx256': 4, 'avx3': 8, 'avx512': 8} + longest_ext = [t for t in get_cpu_info()["flags"] if t.startswith('avx')][-1] + if longest_ext not in avx_to_width.keys(): + if longest_ext[:6] not in avx_to_width.keys(): + assert longest_ext[:4] in avx_to_width.keys(), \ + "The vector extension of your architecture is unknown. Disable vectorisation!" + return avx_to_width[longest_ext[:4]] + else: + return avx_to_width[longest_ext[:6]] + else: + return avx_to_width[longest_ext] + + class Configuration(dict): r"""PyOP2 configuration parameters @@ -75,8 +91,8 @@ class Configuration(dict): # name, env variable, type, default, write once DEFAULTS = { "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"), - "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), - "vectorization_strategy":("PYOP2_VECT_STRATEGY", str, "ve"), + "simd_width": ("PYOP2_SIMD_WIDTH", int, default_simd_width()), + "vectorization_strategy": ("PYOP2_VECT_STRATEGY", str, "ve"), "alignment": ("PYOP2_ALIGNMENT", int, 64), "time": ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), diff --git a/requirements-ext.txt b/requirements-ext.txt index 758ccd963..a73f7da24 100644 --- a/requirements-ext.txt +++ b/requirements-ext.txt @@ -5,3 +5,4 @@ flake8>=2.1.0 pycparser>=2.10 mpi4py>=1.3.1 decorator +py-cpuinfo From 3e669468a339e694ed1e9f6d9f6d6c4cd2b14c8f Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 3 Jul 2020 13:08:23 +0100 Subject: [PATCH 009/100] Generate CVec Target with batch size infomation and move typedef into loopy codebase. --- pyop2/sequential.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 98a2c5abe..12040fef9 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -73,7 +73,7 @@ def vectorise(wrapper, iname, batch_size): return wrapper # create constant zero vectors - wrapper = wrapper.copy(target=loopy.CVecTarget()) + wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size)) kernel = wrapper.root_kernel zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True, initializer=numpy.array(0.0, dtype=numpy.float64), @@ -100,12 +100,6 @@ def vectorise(wrapper, iname, batch_size): wrapper = wrapper.with_root_kernel(kernel) - # vector data type - vec_types = [("double", 8), ("int", 4)] # scalar type, bytes - preamble = ["typedef {0} {0}{1} __attribute__ ((vector_size ({2})));".format(t, batch_size, batch_size * b) for t, b in vec_types] - preamble = "\n" + "\n".join(preamble) - - wrapper = loopy.register_preamble_generators(wrapper, [_PreambleGen(preamble, idx="01")]) return wrapper From 1238ce8f244ded66fb1b5b6c798f183d9ed8e1a3 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 3 Jul 2020 15:59:45 +0100 Subject: [PATCH 010/100] Move zero declaration to loopy code base to be more robust in naming the variable. --- pyop2/sequential.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 12040fef9..5ffd4909e 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -38,7 +38,6 @@ import ctypes import loopy -import numpy from pyop2.datatypes import IntType, as_ctypes from pyop2 import base @@ -60,7 +59,6 @@ from pyop2.profiling import timed_region from pyop2.utils import cached_property, get_petsc_dir from pyop2.configuration import configuration -from pyop2.codegen.rep2loopy import _PreambleGen def vectorise(wrapper, iname, batch_size): @@ -75,12 +73,6 @@ def vectorise(wrapper, iname, batch_size): # create constant zero vectors wrapper = wrapper.copy(target=loopy.CVecTarget(batch_size)) kernel = wrapper.root_kernel - zeros = loopy.TemporaryVariable("_zeros", shape=loopy.auto, dtype=numpy.float64, read_only=True, - initializer=numpy.array(0.0, dtype=numpy.float64), - address_space=loopy.AddressSpace.GLOBAL, zero_size=batch_size) - tmps = kernel.temporary_variables.copy() - tmps["_zeros"] = zeros - kernel = kernel.copy(temporary_variables=tmps) # split iname and vectorize the inner loop slabs = (1, 1) From 1d547779934fd7a0a5cc61407a869dbeaf4a6d62 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 16 Jul 2020 00:33:57 +0100 Subject: [PATCH 011/100] Added conditionals when to vectorise: Don't vectorise, if complex arguments. Check if vect strategy specified, otw dont vectorise. --- pyop2/sequential.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 5ffd4909e..a7fd81e68 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -160,11 +160,16 @@ def code_to_compile(self): iname = "layer" else: iname = "n" + has_matrix = any(arg._is_mat for arg in self._args) has_rw = any(arg.access == RW for arg in self._args) - if isinstance(self._kernel.code, loopy.LoopKernel) and not (has_matrix or has_rw): + is_cplx = any(arg.dtype.name == 'complex128' for arg in self._args) + vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"]) + + if (isinstance(self._kernel.code, loopy.LoopKernel) and vectorisable): wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name) - wrapper = vectorise(wrapper, iname, configuration["simd_width"]) + if not is_cplx: + wrapper = vectorise(wrapper, iname, configuration["simd_width"]) code = loopy.generate_code_v2(wrapper) if self._kernel._cpp: From b36921353942b10b6a0635c2543e99cfae4e9355 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 16 Jul 2020 00:18:15 +0100 Subject: [PATCH 012/100] Drop omp vectorisation. --- pyop2/sequential.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index a7fd81e68..ddb6dd054 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -79,12 +79,7 @@ def vectorise(wrapper, iname, batch_size): inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": - # vectorize using vector extenstions kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname) - else: - # vectoriza using omp pragma simd - assert configuration["vectorization_strategy"] == "omp" - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="omp_simd", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) From 1c6346e90ea5e786c02e83d7d91dcf15a54972a2 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 16 Jul 2020 14:18:15 +0100 Subject: [PATCH 013/100] Add -march=native everywhere. --- pyop2/compilation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index e8d8a921d..3d60bde4e 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -393,7 +393,7 @@ class LinuxCompiler(Compiler): :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to COMM_WORLD).""" def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-O3', '-ffast-math', '-fopenmp-simd'] + opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" @@ -419,7 +419,7 @@ class LinuxIntelCompiler(Compiler): rank 0 compiles code) (defaults to COMM_WORLD). """ def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None): - opt_flags = ['-Ofast', '-xHost', '-qopenmp-simd'] + opt_flags = ['-march=native', '-Ofast', '-xHost', '-qopenmp-simd'] if configuration['debug']: opt_flags = ['-O0', '-g'] cc = "mpicc" From 856b6aab584b30ad88d366246d5d2305274d786a Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 22 Jul 2020 12:45:21 +0100 Subject: [PATCH 014/100] Silence warnings. --- pyop2/codegen/rep2loopy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index c0597bfc5..b39b37d49 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -613,7 +613,7 @@ def generate(builder, wrapper_name=None): lang_version=(2018, 2), name=wrapper_name, # TODO, should these really be silenced? - silenced_warnings=["write_race*"]) + silenced_warnings=["write_race*", "data_dep*"]) # prioritize loops for indices in context.index_ordering: From 5e52ce1f78b57a3fa948910c8edd55de68b47559 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Mon, 24 Aug 2020 11:24:46 +0200 Subject: [PATCH 015/100] Change vector tag. --- pyop2/sequential.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index ddb6dd054..6b6022995 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -79,7 +79,7 @@ def vectorise(wrapper, iname, batch_size): inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": - kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="c_vec", inner_iname=inner_iname) + kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="vec", inner_iname=inner_iname) alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) From 537c14c94f946f7f34399745966c2f07c50a70f8 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Sep 2020 14:44:26 +0200 Subject: [PATCH 016/100] Give more control over vectorisation to PyOP2. --- pyop2/sequential.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 6b6022995..5b5163fe6 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -85,8 +85,23 @@ def vectorise(wrapper, iname, batch_size): tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) kernel = kernel.copy(temporary_variables=tmps) - wrapper = wrapper.with_root_kernel(kernel) + from loopy.preprocess import check_cvec_vectorizability, cvec_privatize + from loopy.kernel.data import OpenMPSIMDTag, VectorizeTag + from loopy.transform.iname import tag_inames + + # try to vectorise with vector extensionn + pragma_inst, vector_inst, iname_to_pragma, iname_to_unr = check_cvec_vectorizability(kernel) + + # if not possible fall back to OpenMP SIMD pragmas or unrolling by retagging + for i in iname_to_pragma: + kernel = tag_inames(kernel, [(i, "omp_simd")], retag=True) + for i in iname_to_unr: + kernel = tag_inames(kernel, [(i, "unr")], retag=True) + kernel = cvec_privatize(kernel, pragma_inst, vector_inst) + + wrapper = wrapper.with_root_kernel(kernel) + return wrapper From 9317654904d64854eb16b8016df572ea80df3098 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Sep 2020 17:02:40 +0200 Subject: [PATCH 017/100] Naming adaption. --- pyop2/sequential.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index 5b5163fe6..b00e66e71 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -90,15 +90,10 @@ def vectorise(wrapper, iname, batch_size): from loopy.transform.iname import tag_inames # try to vectorise with vector extensionn - pragma_inst, vector_inst, iname_to_pragma, iname_to_unr = check_cvec_vectorizability(kernel) - - # if not possible fall back to OpenMP SIMD pragmas or unrolling by retagging - for i in iname_to_pragma: - kernel = tag_inames(kernel, [(i, "omp_simd")], retag=True) - for i in iname_to_unr: - kernel = tag_inames(kernel, [(i, "unr")], retag=True) + vector_inst, pragma_inst_to_tag, unr_inst_to_tag = check_cvec_vectorizability(kernel) - kernel = cvec_privatize(kernel, pragma_inst, vector_inst) + # if not possible fall back to OpenMP SIMD pragmas or unrolling by retagging, then privatize + kernel = cvec_retag_and_privatize(kernel, vector_inst, pragma_inst_to_tag, unr_inst_to_tag) wrapper = wrapper.with_root_kernel(kernel) From 6723b6a44207c66974347968aaf682e777ac3762 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Sep 2020 17:03:09 +0200 Subject: [PATCH 018/100] Realize ilp first. --- pyop2/sequential.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyop2/sequential.py b/pyop2/sequential.py index b00e66e71..f72a96143 100644 --- a/pyop2/sequential.py +++ b/pyop2/sequential.py @@ -85,10 +85,12 @@ def vectorise(wrapper, iname, batch_size): tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) kernel = kernel.copy(temporary_variables=tmps) - from loopy.preprocess import check_cvec_vectorizability, cvec_privatize + from loopy.preprocess import check_cvec_vectorizability, cvec_retag_and_privatize, realize_ilp from loopy.kernel.data import OpenMPSIMDTag, VectorizeTag from loopy.transform.iname import tag_inames + kernel = realize_ilp(kernel) # FIXME: do we also need to realize the reductions first? + # try to vectorise with vector extensionn vector_inst, pragma_inst_to_tag, unr_inst_to_tag = check_cvec_vectorizability(kernel) From 38ebc8aca6703ac60b1e84717d28d948d261f669 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Sep 2020 17:16:58 +0200 Subject: [PATCH 019/100] Jenkins. --- requirements-git.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-git.txt b/requirements-git.txt index 4790f7f1b..7e682bf1c 100644 --- a/requirements-git.txt +++ b/requirements-git.txt @@ -1,4 +1,4 @@ git+https://github.com/firedrakeproject/petsc.git@firedrake#egg=petsc --no-deps git+https://github.com/firedrakeproject/petsc4py.git@firedrake#egg=petsc4py git+https://github.com/coneoproject/COFFEE.git#egg=coffee -git+https://github.com/firedrakeproject/loopy.git@cvec#egg=loopy +git+https://github.com/firedrakeproject/loopy.git@cvec-restructure-checks#egg=loopy From 944c6cf73b302a1c6b2a5a4bc919db6818e8f96c Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Mar 2022 11:19:19 +0100 Subject: [PATCH 020/100] DBM: run against new loopy branch --- requirements-git.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-git.txt b/requirements-git.txt index 7f36bba5e..55cd0e593 100644 --- a/requirements-git.txt +++ b/requirements-git.txt @@ -1,2 +1,2 @@ git+https://github.com/coneoproject/COFFEE.git#egg=coffee -git+https://github.com/firedrakeproject/loopy.git@cvec-restructure-checks#egg=loopy +git+https://github.com/firedrakeproject/loopy.git@vectorisation-sprint#egg=loopy From 3a1eb249d18d2a313545bf5bfcbecdaf65feba26 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Mar 2022 12:58:11 +0100 Subject: [PATCH 021/100] Lint --- pyop2/global_kernel.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 6b1384b1c..b8b020752 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -15,7 +15,6 @@ from pyop2.datatypes import IntType, as_ctypes from pyop2.types import IterationRegion from pyop2.utils import cached_property, get_petsc_dir -from pyop2 import configuration from pyop2 import op2 @@ -355,7 +354,7 @@ def code_to_compile(self): device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" return code.device_code() - + def vectorise(wrapper, iname, batch_size): """Return a vectorised version of wrapper, vectorising over iname. @@ -381,8 +380,6 @@ def vectorise(wrapper, iname, batch_size): kernel = kernel.copy(temporary_variables=tmps) from lp.preprocess import check_cvec_vectorizability, cvec_retag_and_privatize, realize_ilp - from lp.kernel.data import OpenMPSIMDTag, VectorizeTag - from lp.transform.iname import tag_inames kernel = realize_ilp(kernel) # FIXME: do we also need to realize the reductions first? @@ -393,7 +390,7 @@ def vectorise(wrapper, iname, batch_size): kernel = cvec_retag_and_privatize(kernel, vector_inst, pragma_inst_to_tag, unr_inst_to_tag) wrapper = wrapper.with_root_kernel(kernel) - + return wrapper @PETSc.Log.EventDecorator() From 681e3159dca53b82e65dd91405959f08c7c0726a Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Mar 2022 13:03:19 +0100 Subject: [PATCH 022/100] More adapations to new PyOP2 --- pyop2/global_kernel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index b8b020752..a5857fcec 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -332,12 +332,12 @@ def code_to_compile(self): from pyop2.codegen.rep2loopy import generate wrapper = generate(self.builder) - if self._iterset._extruded: + if self._extruded: iname = "layer" else: iname = "n" - has_matrix = any(arg._is_mat for arg in self._args) + has_matrix = any(isinstance(arg, MatKernelArg) for arg in self._args) has_rw = any(arg.access == op2.RW for arg in self._args) is_cplx = any(arg.dtype.name == 'complex128' for arg in self._args) vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"]) From 48d61428fe3477941445b5392f31af29ffa4ecd7 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 1 Mar 2022 13:04:15 +0100 Subject: [PATCH 023/100] More adapations to new PyOP2 --- pyop2/global_kernel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index a5857fcec..2a3967f7f 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -337,9 +337,9 @@ def code_to_compile(self): else: iname = "n" - has_matrix = any(isinstance(arg, MatKernelArg) for arg in self._args) - has_rw = any(arg.access == op2.RW for arg in self._args) - is_cplx = any(arg.dtype.name == 'complex128' for arg in self._args) + has_matrix = any(isinstance(arg, MatKernelArg) for arg in self.arguments) + has_rw = any(arg.access == op2.RW for arg in self.arguments) + is_cplx = any(arg.dtype.name == 'complex128' for arg in self.arguments) vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"]) if (isinstance(self._kernel.code, lp.LoopKernel) and vectorisable): From 792c8f04c49e01ea1cf76320daee21e4314da9cc Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 13:17:38 +0100 Subject: [PATCH 024/100] DBM take the correct branch --- requirements-git.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-git.txt b/requirements-git.txt index 55cd0e593..d6200093f 100644 --- a/requirements-git.txt +++ b/requirements-git.txt @@ -1,2 +1,2 @@ git+https://github.com/coneoproject/COFFEE.git#egg=coffee -git+https://github.com/firedrakeproject/loopy.git@vectorisation-sprint#egg=loopy +git+https://github.com/firedrakeproject/loopy.git@c_vecextensions_target#egg=loopy From 2469870e08df111b3a3c2470629f6212c5e5a4af Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 15:00:23 +0100 Subject: [PATCH 025/100] Adapt to new PyOP2 and vectorisation --- pyop2/global_kernel.py | 58 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 2a3967f7f..7d1bcbc52 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -338,13 +338,11 @@ def code_to_compile(self): iname = "n" has_matrix = any(isinstance(arg, MatKernelArg) for arg in self.arguments) - has_rw = any(arg.access == op2.RW for arg in self.arguments) - is_cplx = any(arg.dtype.name == 'complex128' for arg in self.arguments) - vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"]) + has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) + is_cplx = any(arg.dtype == 'complex128' for arg in self.local_kernel.arguments) + vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx - if (isinstance(self._kernel.code, lp.LoopKernel) and vectorisable): - wrapper = lp.inline_callable_kernel(wrapper, self._kernel.name) - if not is_cplx: + if vectorisable: wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) @@ -364,34 +362,40 @@ def vectorise(wrapper, iname, batch_size): if batch_size == 1: return wrapper - # create constant zero vectors - wrapper = wrapper.copy(target=lp.CVecTarget(batch_size)) - kernel = wrapper.root_kernel + wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) + kernel = wrapper.default_entrypoint + + # align temps + alignment = configuration["alignment"] + tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) + kernel = kernel.copy(temporary_variables=tmps) - # split iname and vectorize the inner loop + # split iname slabs = (1, 1) inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": kernel = lp.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="vec", inner_iname=inner_iname) - alignment = configuration["alignment"] - tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) - kernel = kernel.copy(temporary_variables=tmps) - - from lp.preprocess import check_cvec_vectorizability, cvec_retag_and_privatize, realize_ilp - - kernel = realize_ilp(kernel) # FIXME: do we also need to realize the reductions first? - - # try to vectorise with vector extensionn - vector_inst, pragma_inst_to_tag, unr_inst_to_tag = check_cvec_vectorizability(kernel) - - # if not possible fall back to OpenMP SIMD pragmas or unrolling by retagging, then privatize - kernel = cvec_retag_and_privatize(kernel, vector_inst, pragma_inst_to_tag, unr_inst_to_tag) - - wrapper = wrapper.with_root_kernel(kernel) - - return wrapper + # private the temporaries on the inner inames + kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname) + + # tag axes of the temporaries as vectorised + if tmps: + # The following only works if I uncomment the error I ge in + # File "/Users/sv2518/firedrakeinstalls/fresh/firedrake/src/loopy/loopy/kernel/array.py", line 803, in __init__ + # The error is + # loopy.diagnostic.LoopyError: contradictory values for number of dimensions of array 't0' from shape, strides, dim_tags, or dim_names + kernel = lp.tag_array_axes(kernel, ",".join(tmps.keys()), "vec") + + # tag the inner iname as vectorized + kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag()}) + # FIXME I want to do + # kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) + # but it throws the error + # pytools.tag.NonUniqueTagError: Multiple tags are direct subclasses of the following UniqueTag(s): InameImplementationTag + + return kernel @PETSc.Log.EventDecorator() @mpi.collective From 4bbcde52bc28e3a39f97548d16e914d03f9cc3d3 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 15:07:05 +0100 Subject: [PATCH 026/100] Adapt to new PyOP2 and vectorisation --- pyop2/global_kernel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 7d1bcbc52..f3f588586 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -343,6 +343,10 @@ def code_to_compile(self): vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx if vectorisable: + #FIXME inside the vectorisation we loose the connection of the wrapper kernel + # to the kernels that it is calling + # I run into + # loopy.diagnostic.LoopyError: Unknown function 'expression_kernel' -- register a callable corresponding to it. wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) @@ -353,7 +357,7 @@ def code_to_compile(self): return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" return code.device_code() - def vectorise(wrapper, iname, batch_size): + def vectorise(self, wrapper, iname, batch_size): """Return a vectorised version of wrapper, vectorising over iname. :arg wrapper: A loopy kernel to vectorise. From a5c04558ac57df36e44cb412c837ead39030afc7 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 15:18:15 +0100 Subject: [PATCH 027/100] Fix return wrapper with kernel not kernel --- pyop2/global_kernel.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index f3f588586..a6745f59f 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -343,10 +343,6 @@ def code_to_compile(self): vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx if vectorisable: - #FIXME inside the vectorisation we loose the connection of the wrapper kernel - # to the kernels that it is calling - # I run into - # loopy.diagnostic.LoopyError: Unknown function 'expression_kernel' -- register a callable corresponding to it. wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) @@ -399,7 +395,7 @@ def vectorise(self, wrapper, iname, batch_size): # but it throws the error # pytools.tag.NonUniqueTagError: Multiple tags are direct subclasses of the following UniqueTag(s): InameImplementationTag - return kernel + return wrapper.with_kernel(kernel) @PETSc.Log.EventDecorator() @mpi.collective From c3740317d9697930fcebacf90b288a156300d82c Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 15:29:47 +0100 Subject: [PATCH 028/100] We do need to inline bc Implementing transforms that apply cleanly across caller-callee is a bit involved and loopy can't deal with it yet. --- pyop2/global_kernel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index a6745f59f..3bd2b4ea2 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -343,6 +343,7 @@ def code_to_compile(self): vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx if vectorisable: + wrapper = lp.inline_callable_kernel(wrapper, self.local_kernel.name) wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) From e7d31eb0c6a574a62d612108744501e7c65d865b Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 15:38:45 +0100 Subject: [PATCH 029/100] First split then tag because loopy does not support retaggin of inames by default --- pyop2/global_kernel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 3bd2b4ea2..9d4d44f5d 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -376,7 +376,10 @@ def vectorise(self, wrapper, iname, batch_size): inner_iname = iname + "_batch" if configuration["vectorization_strategy"] == "ve": - kernel = lp.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="vec", inner_iname=inner_iname) + kernel = lp.split_iname(kernel, iname, batch_size, slabs=slabs, inner_iname=inner_iname) + + # tag the inner iname as vectorized + kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) # private the temporaries on the inner inames kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname) From 56a8dde69e09fa00f1469a7537398715c739d7c2 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 15:48:26 +0100 Subject: [PATCH 030/100] tag_array_axes requires us to specify the tags for each dimension of the temporary. I.e. if the temporary is 1D: 'vec', if the temporary is 2D: 'vec, c', if the temporary is 3D: 'vec,c,c' --- pyop2/global_kernel.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 9d4d44f5d..42dfdeee8 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -385,19 +385,9 @@ def vectorise(self, wrapper, iname, batch_size): kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname) # tag axes of the temporaries as vectorised - if tmps: - # The following only works if I uncomment the error I ge in - # File "/Users/sv2518/firedrakeinstalls/fresh/firedrake/src/loopy/loopy/kernel/array.py", line 803, in __init__ - # The error is - # loopy.diagnostic.LoopyError: contradictory values for number of dimensions of array 't0' from shape, strides, dim_tags, or dim_names - kernel = lp.tag_array_axes(kernel, ",".join(tmps.keys()), "vec") - - # tag the inner iname as vectorized - kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag()}) - # FIXME I want to do - # kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) - # but it throws the error - # pytools.tag.NonUniqueTagError: Multiple tags are direct subclasses of the following UniqueTag(s): InameImplementationTag + for name, tmp in tmps.items(): + tag = "vec" + len(tmp.shape)*",c" + kernel = lp.tag_array_axes(kernel, name, tag) return wrapper.with_kernel(kernel) From d1171b3faf6791f22c69383bec3f8c0ce16485d0 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 15:52:43 +0100 Subject: [PATCH 031/100] Fix --- pyop2/global_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 42dfdeee8..7c0d7fd51 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -386,7 +386,7 @@ def vectorise(self, wrapper, iname, batch_size): # tag axes of the temporaries as vectorised for name, tmp in tmps.items(): - tag = "vec" + len(tmp.shape)*",c" + tag = "vec" + (len(tmp.shape)-1)*",c" kernel = lp.tag_array_axes(kernel, name, tag) return wrapper.with_kernel(kernel) From 0641c759ed9aeb1cfe43553611f3720d052d0c73 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 16:01:03 +0100 Subject: [PATCH 032/100] fix --- pyop2/global_kernel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 7c0d7fd51..aa4c99d6a 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -377,18 +377,18 @@ def vectorise(self, wrapper, iname, batch_size): if configuration["vectorization_strategy"] == "ve": kernel = lp.split_iname(kernel, iname, batch_size, slabs=slabs, inner_iname=inner_iname) - - # tag the inner iname as vectorized - kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) # private the temporaries on the inner inames kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname) # tag axes of the temporaries as vectorised - for name, tmp in tmps.items(): + for name, tmp in kernel.temporary_variables.items(): tag = "vec" + (len(tmp.shape)-1)*",c" kernel = lp.tag_array_axes(kernel, name, tag) + # tag the inner iname as vectorized + kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) + return wrapper.with_kernel(kernel) @PETSc.Log.EventDecorator() From 644842ee889efb6c21a2633ca4ae0e86c25a6cfd Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 3 Mar 2022 16:08:20 +0100 Subject: [PATCH 033/100] improve comments --- pyop2/global_kernel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index aa4c99d6a..20ce0e165 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -375,10 +375,14 @@ def vectorise(self, wrapper, iname, batch_size): slabs = (1, 1) inner_iname = iname + "_batch" + # in the ideal world breaks a loop of n*batch_size into two loops: + # an outer loop of n/batch_size + # and an inner loop over batch_size if configuration["vectorization_strategy"] == "ve": kernel = lp.split_iname(kernel, iname, batch_size, slabs=slabs, inner_iname=inner_iname) - # private the temporaries on the inner inames + # adds a new axis to the temporary and indexes it with the provided iname + # i.e. stores the value at each instance of the loop. kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname) # tag axes of the temporaries as vectorised From 9e58b227e57c2381aedb901963e29a508e0af679 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 3 Mar 2022 12:45:33 -0600 Subject: [PATCH 034/100] tag only non-constant arrays with vec axes --- pyop2/global_kernel.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 20ce0e165..5b9b5fdcb 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -343,8 +343,8 @@ def code_to_compile(self): vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx if vectorisable: - wrapper = lp.inline_callable_kernel(wrapper, self.local_kernel.name) - wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) + wrapper = lp.inline_callable_kernel(wrapper, self.local_kernel.name) + wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: @@ -365,7 +365,7 @@ def vectorise(self, wrapper, iname, batch_size): wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) kernel = wrapper.default_entrypoint - + # align temps alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) @@ -387,8 +387,9 @@ def vectorise(self, wrapper, iname, batch_size): # tag axes of the temporaries as vectorised for name, tmp in kernel.temporary_variables.items(): - tag = "vec" + (len(tmp.shape)-1)*",c" - kernel = lp.tag_array_axes(kernel, name, tag) + if not (tmp.read_only and tmp.initializer is not None): + tag = (len(tmp.shape)-1)*"c," + "vec" + kernel = lp.tag_array_axes(kernel, name, tag) # tag the inner iname as vectorized kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) From 3f133fd7ad866209751a824be877c0f431b22cee Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 4 Mar 2022 15:52:15 +0100 Subject: [PATCH 035/100] Only vectorise when local kernel is a loopy thing. --- pyop2/global_kernel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 5b9b5fdcb..ca5e152d3 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -343,8 +343,9 @@ def code_to_compile(self): vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx if vectorisable: - wrapper = lp.inline_callable_kernel(wrapper, self.local_kernel.name) - wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) + if isinstance(self.local_kernel.code, lp.TranslationUnit): + wrapper = lp.inline_callable_kernel(wrapper, self.local_kernel.name) + wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: From dcd0b695eeb9577af2f0ffeca58dbe9b641b714d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 4 Mar 2022 13:59:30 -0600 Subject: [PATCH 036/100] shift iel-loop to have lbound of 0 --- pyop2/global_kernel.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index ca5e152d3..53d02cf1b 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -371,19 +371,24 @@ def vectorise(self, wrapper, iname, batch_size): alignment = configuration["alignment"] tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) kernel = kernel.copy(temporary_variables=tmps) + iel = kernel.get_var_name_generator()("iel") + # make iel-loop so that the loop undergoing array expansion has a lower + # bound of '0' + kernel = lp.affine_map_inames(kernel, iname, iel, f"{iel}=({iname}-start)") # split iname slabs = (1, 1) - inner_iname = iname + "_batch" + inner_iname = kernel.get_var_name_generator()(f"{iel}_batch") # in the ideal world breaks a loop of n*batch_size into two loops: # an outer loop of n/batch_size # and an inner loop over batch_size if configuration["vectorization_strategy"] == "ve": - kernel = lp.split_iname(kernel, iname, batch_size, slabs=slabs, inner_iname=inner_iname) + kernel = lp.split_iname(kernel, iel, batch_size, slabs=slabs, inner_iname=inner_iname) # adds a new axis to the temporary and indexes it with the provided iname - # i.e. stores the value at each instance of the loop. + # i.e. stores the value at each instance of the loop. (i.e. array + # expansion) kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname) # tag axes of the temporaries as vectorised From 907fe58d36949704ea943bc55a734ba2835f9f6b Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Sun, 6 Mar 2022 11:37:05 +0100 Subject: [PATCH 037/100] Fix import --- pyop2/global_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 53d02cf1b..8aa9eb8d3 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -349,7 +349,7 @@ def code_to_compile(self): code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: - from lp.codegen.result import process_preambles + from loopy.codegen.result import process_preambles preamble = "".join(process_preambles(getattr(code, "device_preambles", []))) device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" From ca2aaaf68ea32f2274e783e5817a179e89641cae Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Sun, 6 Mar 2022 12:22:58 +0100 Subject: [PATCH 038/100] Debug: try with newer python version --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45df5ed57..ab79c43fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: - name: Set correct Python version uses: actions/setup-python@v2 with: - python-version: '3.6' + python-version: '3.8' - name: Clone PETSc uses: actions/checkout@v2 From 0440f66929e4bb6ab09561753a996279303f3062 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Sun, 6 Mar 2022 12:38:05 +0100 Subject: [PATCH 039/100] Debug: try with newer python version --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ab79c43fd..b334b2438 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: - name: Set correct Python version uses: actions/setup-python@v2 with: - python-version: '3.8' + python-version: '3.9' - name: Clone PETSc uses: actions/checkout@v2 From 4bcb5924e2805f255022f79763ca1f16a3256345 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 6 Mar 2022 18:52:29 -0600 Subject: [PATCH 040/100] change target before inlining --- pyop2/global_kernel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 8aa9eb8d3..af050a26b 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -344,6 +344,9 @@ def code_to_compile(self): if vectorisable: if isinstance(self.local_kernel.code, lp.TranslationUnit): + # change target to generate vectorized code via gcc vector + # extensions + wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) wrapper = lp.inline_callable_kernel(wrapper, self.local_kernel.name) wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) @@ -364,7 +367,6 @@ def vectorise(self, wrapper, iname, batch_size): if batch_size == 1: return wrapper - wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) kernel = wrapper.default_entrypoint # align temps From d42e7e824a451921927e5d5677991566faa55867 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 6 Mar 2022 19:01:03 -0600 Subject: [PATCH 041/100] ignore loopy vectorization fallback warnings --- pyop2/global_kernel.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index af050a26b..d80209a2b 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -367,6 +367,14 @@ def vectorise(self, wrapper, iname, batch_size): if batch_size == 1: return wrapper + if not configuration["debug"]: + # loopy warns for every instruction that cannot be vectorized; + # ignore in non-debug mode. + new_entrypoint = wrapper.default_entrypoint.copy( + silenced_warnings=(wrapper.default_entrypoint.silenced_warnings + + ["vectorize_failed"])) + wrapper = wrapper.with_kernel(new_entrypoint) + kernel = wrapper.default_entrypoint # align temps From 7e37e02f8015861c5e02ebaa7bf2f77bfa51c4af Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Sun, 6 Mar 2022 12:43:55 +0100 Subject: [PATCH 042/100] Revert "Debug: try with newer python version" This reverts commit ca2aaaf68ea32f2274e783e5817a179e89641cae. --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b334b2438..45df5ed57 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: - name: Set correct Python version uses: actions/setup-python@v2 with: - python-version: '3.9' + python-version: '3.6' - name: Clone PETSc uses: actions/checkout@v2 From b541dbd8e38588e37d3b242f6c035ee6f875f9dd Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 11 Mar 2022 17:01:42 +0100 Subject: [PATCH 043/100] Make complex check tighter --- pyop2/global_kernel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index d80209a2b..7f5213ec6 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -339,7 +339,8 @@ def code_to_compile(self): has_matrix = any(isinstance(arg, MatKernelArg) for arg in self.arguments) has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) - is_cplx = any(arg.dtype == 'complex128' for arg in self.local_kernel.arguments) + is_cplx = (any(arg.dtype == 'complex128' for arg in self.local_kernel.arguments) + or any(arg.dtype.dtype == 'complex128' for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx if vectorisable: From caa567a2e160ea0c7e773d4658dac10a63f280ba Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 11 Mar 2022 12:47:01 -0600 Subject: [PATCH 044/100] extend the set of variables that cannot be vecotrized --- pyop2/global_kernel.py | 51 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 7f5213ec6..c0c0e1d41 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -380,22 +380,54 @@ def vectorise(self, wrapper, iname, batch_size): # align temps alignment = configuration["alignment"] - tmps = dict((name, tv.copy(alignment=alignment)) for name, tv in kernel.temporary_variables.items()) + tmps = {name: tv.copy(alignment=alignment) + for name, tv in kernel.temporary_variables.items()} kernel = kernel.copy(temporary_variables=tmps) - iel = kernel.get_var_name_generator()("iel") - # make iel-loop so that the loop undergoing array expansion has a lower - # bound of '0' - kernel = lp.affine_map_inames(kernel, iname, iel, f"{iel}=({iname}-start)") + shifted_iname = kernel.get_var_name_generator()(f"{iname}_shift") + + # {{{ + + # Do not vectorize temporaries used outside *iname* + from functools import reduce + temps_not_to_vectorize = reduce(set.union, + [(insn.dependency_names() + & frozenset(kernel.temporary_variables)) + for insn in kernel.instructions + if iname not in insn.within_inames], + set()) + + # Constant literal temporaries are arguments => cannot vectorize + temps_not_to_vectorize |= {name + for name, tv in kernel.temporary_variables.items() + if (tv.read_only + and tv.initializer is not None)} + + # }}} + + # {{{ TODO: placeholder until loopy's simplify_using_pwaff gets smarter + + # transform to ensure that the loop undergoing array expansion has a + # lower bound of '0' + from loopy.symbolic import pw_aff_to_expr + import pymbolic.primitives as prim + lbound = pw_aff_to_expr(kernel.get_iname_bounds(iname).lower_bound_pw_aff) + + kernel = lp.affine_map_inames(kernel, iname, shifted_iname, + [(prim.Variable(shifted_iname), + (prim.Variable(iname) - lbound))]) + + # }}} # split iname slabs = (1, 1) - inner_iname = kernel.get_var_name_generator()(f"{iel}_batch") + inner_iname = kernel.get_var_name_generator()(f"{shifted_iname}_batch") # in the ideal world breaks a loop of n*batch_size into two loops: # an outer loop of n/batch_size # and an inner loop over batch_size if configuration["vectorization_strategy"] == "ve": - kernel = lp.split_iname(kernel, iel, batch_size, slabs=slabs, inner_iname=inner_iname) + kernel = lp.split_iname(kernel, shifted_iname, batch_size, slabs=slabs, + inner_iname=inner_iname) # adds a new axis to the temporary and indexes it with the provided iname # i.e. stores the value at each instance of the loop. (i.e. array @@ -404,12 +436,13 @@ def vectorise(self, wrapper, iname, batch_size): # tag axes of the temporaries as vectorised for name, tmp in kernel.temporary_variables.items(): - if not (tmp.read_only and tmp.initializer is not None): + if name not in temps_not_to_vectorize: tag = (len(tmp.shape)-1)*"c," + "vec" kernel = lp.tag_array_axes(kernel, name, tag) # tag the inner iname as vectorized - kernel = lp.tag_inames(kernel, {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) + kernel = lp.tag_inames(kernel, + {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) return wrapper.with_kernel(kernel) From c3a96fa6b1f27911ce19cdb20bb75bd18a4b5e1a Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Mon, 14 Mar 2022 13:37:37 +0100 Subject: [PATCH 045/100] Attempt to fix Slate by inlining of all subkernels --- pyop2/global_kernel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index c0c0e1d41..04105ed9c 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -348,7 +348,12 @@ def code_to_compile(self): # change target to generate vectorized code via gcc vector # extensions wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) - wrapper = lp.inline_callable_kernel(wrapper, self.local_kernel.name) + names = self.local_kernel.code.callables_table + for name in names: + if name in wrapper.callables_table.keys() \ + and isinstance(wrapper.callables_table[name], lp.CallableKernel): + print(name) + wrapper = lp.inline_callable_kernel(wrapper, name) wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) From dc996de8c73fa5090e890b883fdef923eab5d1b1 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Mon, 14 Mar 2022 13:39:23 +0100 Subject: [PATCH 046/100] Add comment --- pyop2/global_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 04105ed9c..15f853b74 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -348,11 +348,11 @@ def code_to_compile(self): # change target to generate vectorized code via gcc vector # extensions wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) + # inline all inner kernels names = self.local_kernel.code.callables_table for name in names: if name in wrapper.callables_table.keys() \ and isinstance(wrapper.callables_table[name], lp.CallableKernel): - print(name) wrapper = lp.inline_callable_kernel(wrapper, name) wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) From fa343e135b0e092ee4022b7b87058131c0b8b1cb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 15 Mar 2022 10:30:37 -0500 Subject: [PATCH 047/100] placate flake8 --- pyop2/global_kernel.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 15f853b74..d30eed467 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -348,11 +348,12 @@ def code_to_compile(self): # change target to generate vectorized code via gcc vector # extensions wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) - # inline all inner kernels + # inline all inner kernels names = self.local_kernel.code.callables_table for name in names: - if name in wrapper.callables_table.keys() \ - and isinstance(wrapper.callables_table[name], lp.CallableKernel): + if (name in wrapper.callables_table.keys() + and isinstance(wrapper.callables_table[name], + lp.CallableKernel)): wrapper = lp.inline_callable_kernel(wrapper, name) wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) code = lp.generate_code_v2(wrapper) From aa7bc0c2bb632575630e1967ca435b2fcaff4700 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 1 Apr 2022 01:26:21 -0500 Subject: [PATCH 048/100] blas callables: do not accept vectorized dtypes --- pyop2/codegen/rep2loopy.py | 4 ++++ pyop2/global_kernel.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index a90131d9f..1adbe2eee 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -139,6 +139,7 @@ def with_types(self, arg_id_to_dtype, callables_table): callables_table) def emit_call_insn(self, insn, target, expression_to_code_mapper): + from loopy.codegen import UnvectorizableError assert self.is_ready_for_codegen() assert isinstance(insn, loopy.CallInstruction) @@ -147,6 +148,9 @@ def emit_call_insn(self, insn, target, expression_to_code_mapper): parameters = list(parameters) par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + if expression_to_code_mapper.codegen_state.vectorization_info: + raise UnvectorizableError("LACallable: cannot take in vector arrays") + parameters.append(insn.assignees[-1]) par_dtypes.append(self.arg_id_to_dtype[0]) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index d30eed467..30af65fe5 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -391,7 +391,7 @@ def vectorise(self, wrapper, iname, batch_size): kernel = kernel.copy(temporary_variables=tmps) shifted_iname = kernel.get_var_name_generator()(f"{iname}_shift") - # {{{ + # {{{ record temps that cannot be vectorized # Do not vectorize temporaries used outside *iname* from functools import reduce From 8302d5205de8f47c64cb4a0e7588092ada1d171c Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 May 2022 11:36:52 -0500 Subject: [PATCH 049/100] allow inverse.c::inverse() to take in vector dtypes --- pyop2/codegen/c/inverse.c | 29 +++++++++++++++++++++++---- pyop2/codegen/rep2loopy.py | 40 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 4 deletions(-) diff --git a/pyop2/codegen/c/inverse.c b/pyop2/codegen/c/inverse.c index 42964604a..a0fd3130a 100644 --- a/pyop2/codegen/c/inverse.c +++ b/pyop2/codegen/c/inverse.c @@ -6,17 +6,38 @@ #define BUF_SIZE 30 static PetscBLASInt ipiv_buffer[BUF_SIZE]; static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE]; +static PetscScalar Aout_proxy_buffer[BUF_SIZE*BUF_SIZE]; #endif -static void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N) +static void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N, + PetscBLASInt incA, PetscBLASInt incAout) { PetscBLASInt info; PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv)); PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork)); - memcpy(Aout, A, N*N*sizeof(PetscScalar)); - LAPACKgetrf_(&N, &N, Aout, &N, ipiv, &info); + + PetscInt N_sq = N * N; + PetscInt one = 1; + + // Aout_proxy: 'Aout', but stored contiguously + PetscScalar *Aout_proxy; + if (incAout == 1) + Aout_proxy = Aout; + else + { + // TODO: Must see if allocating has a significant performance impact + Aout_proxy = N_sq <= BUF_SIZE ? Aout_proxy_buffer : malloc(N*N*sizeof(*Aout)); + } + + BLAScopy_(&N_sq, A, &incA, Aout_proxy, &one); + + LAPACKgetrf_(&N, &N, Aout_proxy, &N, ipiv, &info); if(info == 0){ - LAPACKgetri_(&N, Aout, &N, ipiv, Awork, &N, &info); + LAPACKgetri_(&N, Aout_proxy, &N, ipiv, Awork, &N, &info); + + // Copy Aout_proxy back to Aout + if (Aout != Aout_proxy) + BLAScopy_(&N_sq, Aout_proxy, &one, Aout, &incAout); } if(info != 0){ fprintf(stderr, "Getri throws nonzero info."); diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 1adbe2eee..5ee8a357b 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -177,6 +177,46 @@ class INVCallable(LACallable): """ name = "inverse" + def with_descrs(self, arg_id_to_descr, callables_table): + a_descr = arg_id_to_descr.get(0) + a_inv_descr = arg_id_to_descr.get(-1) + + if a_descr is None or a_inv_descr is None: + # shapes aren't specialized enough to be resolved + return self, callables_table + + assert len(a_descr.shape) == 2 + assert a_descr.shape == a_inv_descr.shape + assert a_descr.shape[1] == a_descr.shape[0] + + return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + from loopy.codegen import UnvectorizableError + + # Override codegen to emit stride info. to the blas calls. + in_descr = self.arg_id_to_descr[0] + out_descr = self.arg_id_to_descr[-1] + ecm = expression_to_code_mapper + + # see pyop2/codegen/c/inverse.c for the func. signature + inc_a = in_descr.dim_tags[1].stride + inc_a_out = out_descr.dim_tags[1].stride + n = in_descr.shape[0] + + a, = insn.expression.parameters + a_out, = insn.assignees + + if ecm.codegen_state.vectorization_info is not None: + raise UnvectorizableError("cannot vectorize 'inverse'.") + + c_parameters = [ecm(a_out).expr, + ecm(a).expr, + n, + inc_a, + inc_a_out] + return var(self.name_in_target)(*c_parameters), False + def generate_preambles(self, target): assert isinstance(target, loopy.CTarget) yield ("inverse", inverse_preamble) From 85de156366b72cb6bfaa83094f9a8d9463e99aea Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 May 2022 15:19:53 -0500 Subject: [PATCH 050/100] do not invoke the vectorization pass if one of the arguments is a MixedMatKernelArg --- pyop2/global_kernel.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 567700908..388505a68 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -333,11 +333,14 @@ def code_to_compile(self): else: iname = "n" - has_matrix = any(isinstance(arg, MatKernelArg) for arg in self.arguments) + # TODO: vectorizing 2-form assembly kernels is possible, but must + # change the arguments passed to MatSetValuesxxx (not yet implemented) + has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg)) + for arg in self.arguments) has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) is_cplx = (any(arg.dtype == 'complex128' for arg in self.local_kernel.arguments) or any(arg.dtype.dtype == 'complex128' for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) - vectorisable = (not (has_matrix or has_rw) and (configuration["vectorization_strategy"])) and not is_cplx + vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx if vectorisable: if isinstance(self.local_kernel.code, lp.TranslationUnit): From 30f8ecbe7b5e5667169225b7dd97cfa5c3ff8395 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 May 2022 18:41:47 -0500 Subject: [PATCH 051/100] makes freeing logic accurate --- pyop2/codegen/c/inverse.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyop2/codegen/c/inverse.c b/pyop2/codegen/c/inverse.c index d180412b2..34022799a 100644 --- a/pyop2/codegen/c/inverse.c +++ b/pyop2/codegen/c/inverse.c @@ -65,8 +65,9 @@ static void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restric fprintf(stderr, "Getri throws nonzero info."); abort(); } - if ( N > BUF_SIZE ) { + + if (Awork != work_buffer) free(Awork); + if (ipiv != ipiv_buffer) free(ipiv); - } } From 0d5023d6d519316c9103df582618304d1073285d Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 May 2022 19:24:15 -0500 Subject: [PATCH 052/100] rewrite solve to accept strided inputs --- pyop2/codegen/c/solve.c | 45 ++++++++++++++++++++++++++++++-------- pyop2/codegen/rep2loopy.py | 45 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 9 deletions(-) diff --git a/pyop2/codegen/c/solve.c b/pyop2/codegen/c/solve.c index fbabc9588..0e24eb7e8 100644 --- a/pyop2/codegen/c/solve.c +++ b/pyop2/codegen/c/solve.c @@ -8,6 +8,8 @@ static PetscBLASInt ipiv_buffer[BUF_SIZE]; static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE]; #endif +static PetscScalar out_proxy_buffer[BUF_SIZE]; + #ifndef PYOP2_SOLVE_LOG_EVENTS #define PYOP2_SOLVE_LOG_EVENTS PetscLogEvent ID_solve_memcpy = -1; @@ -16,15 +18,32 @@ PetscLogEvent ID_solve_getrs = -1; static PetscBool log_active_solve = 0; #endif -void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N) + +/* + * @param[incA]: Stride value while accessing elements of 'A'. + * @param[incB]: Stride value while accessing elements of 'B'. + * @param[incOut]: Stride value while accessing elements of 'out'. + */ +void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N, + PetscBLASInt incA, PetscBLASInt incB, PetscBLASInt incOut) { + PetscScalar* out_proxy; /// output laid-out with unit stride, expected by LAPACK + PetscInt N_sq = N*N; + PetscInt one = 1; PetscLogIsActive(&log_active_solve); if (log_active_solve){PetscLogEventBegin(ID_solve_memcpy,0,0,0,0);} PetscBLASInt info; PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv)); - memcpy(out,B,N*sizeof(PetscScalar)); - PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork)); - memcpy(Awork,A,N*N*sizeof(PetscScalar)); + + if (incOut == 1) + out_proxy = out; + else + out_proxy = (N <= BUF_SIZE) ? out_proxy_buffer : malloc(N*sizeof(*out)); + + BLAScopy_(&N, B, &incB, out_proxy, &one); + + PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N_sq*sizeof(*Awork)); + BLAScopy_(&N_sq, A, &incA, Awork, &one); if (log_active_solve){PetscLogEventEnd(ID_solve_memcpy,0,0,0,0);} PetscBLASInt NRHS = 1; @@ -35,7 +54,11 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con if(info == 0){ if (log_active_solve){PetscLogEventBegin(ID_solve_getrs,0,0,0,0);} - LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out, &N, &info); + LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out_proxy, &N, &info); + + if (out != out_proxy) + BLAScopy_(&N, out_proxy, &one, out, &incOut); + if (log_active_solve){PetscLogEventEnd(ID_solve_getrs,0,0,0,0);} } @@ -44,8 +67,12 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con abort(); } - if ( N > BUF_SIZE ) { - free(ipiv); - free(Awork); - } + if (ipiv != ipiv_buffer) + free(ipiv); + + if (Awork != work_buffer) + free(Awork); + + if (out_proxy != out_proxy_buffer) + free(out_proxy); } diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 8d9e35e6c..43918c134 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -233,6 +233,51 @@ class SolveCallable(LACallable): """ name = "solve" + def with_descrs(self, arg_id_to_descr, callables_table): + a_descr = arg_id_to_descr.get(0) + b_descr = arg_id_to_descr.get(1) + x_descr = arg_id_to_descr.get(-1) + + if a_descr is None or b_descr is None: + # shapes aren't specialized enough to be resolved + return self, callables_table + + assert len(a_descr.shape) == 2 + assert len(x_descr.shape) == 1 + assert b_descr.shape == x_descr.shape + + return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + from loopy.codegen import UnvectorizableError + + # Override codegen to emit stride info. to the blas calls. + a_descr = self.arg_id_to_descr[0] + b_descr = self.arg_id_to_descr[1] + out_descr = self.arg_id_to_descr[-1] + ecm = expression_to_code_mapper + + # see pyop2/codegen/c/solve.c for the func. signature + inc_a = a_descr.dim_tags[1].stride + inc_b = b_descr.dim_tags[0].stride + inc_out = out_descr.dim_tags[0].stride + n = a_descr.shape[0] + + a, b = insn.expression.parameters + out, = insn.assignees + + if ecm.codegen_state.vectorization_info is not None: + raise UnvectorizableError("cannot vectorize 'inverse'.") + + c_parameters = [ecm(out).expr, + ecm(a).expr, + ecm(b).expr, + n, + inc_a, + inc_b, + inc_out] + return var(self.name_in_target)(*c_parameters), False + def generate_preambles(self, target): assert isinstance(target, type(target)) yield ("solve", solve_preamble) From d25545b106dd8878bc7b69bb014688111dee7349 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 5 May 2022 22:22:41 -0500 Subject: [PATCH 053/100] blas-helpers: corrects the freeing logic --- pyop2/codegen/c/inverse.c | 2 ++ pyop2/codegen/c/solve.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyop2/codegen/c/inverse.c b/pyop2/codegen/c/inverse.c index 34022799a..be8bafd6c 100644 --- a/pyop2/codegen/c/inverse.c +++ b/pyop2/codegen/c/inverse.c @@ -70,4 +70,6 @@ static void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restric free(Awork); if (ipiv != ipiv_buffer) free(ipiv); + if ((Aout_proxy != Aout) && (Aout_proxy != Aout_proxy_buffer)) + free(Aout_proxy); } diff --git a/pyop2/codegen/c/solve.c b/pyop2/codegen/c/solve.c index 0e24eb7e8..c1444f1d1 100644 --- a/pyop2/codegen/c/solve.c +++ b/pyop2/codegen/c/solve.c @@ -73,6 +73,6 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con if (Awork != work_buffer) free(Awork); - if (out_proxy != out_proxy_buffer) + if ((out_proxy != out) && (out_proxy != out_proxy_buffer)) free(out_proxy); } From 0ade829070bfa168999a5fde792135fee319cb84 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 6 May 2022 13:23:26 +0200 Subject: [PATCH 054/100] Don't vectorise the kernel which generates the coordinates for the extrusion --- pyop2/global_kernel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 388505a68..bc8d3013c 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -340,7 +340,8 @@ def code_to_compile(self): has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) is_cplx = (any(arg.dtype == 'complex128' for arg in self.local_kernel.arguments) or any(arg.dtype.dtype == 'complex128' for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) - vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx + extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? + vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords if vectorisable: if isinstance(self.local_kernel.code, lp.TranslationUnit): From a4bab8e326469a0d255c04267fd3e97f62741143 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 6 May 2022 14:55:20 +0200 Subject: [PATCH 055/100] PyOP2 compilation: add a pathway to compile with gcc on Mac. --- pyop2/compilation.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 00d14d407..1f62b2c9b 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -139,6 +139,8 @@ def sniff_compiler(exe): compiler = MacClangARMCompiler elif machine == "x86_64": compiler = MacClangCompiler + elif name == "GNU": + compiler = MacGNUCompiler else: compiler = AnonymousCompiler else: @@ -471,6 +473,11 @@ class MacClangARMCompiler(MacClangCompiler): _ldflags = ("-dynamiclib", "-L/opt/homebrew/opt/gcc/lib/gcc/11") +class MacGNUCompiler(MacClangCompiler): + """A compiler for building a shared library on Mac systems with a GNU compiler.""" + _name = "Mac GNU" + + class LinuxGnuCompiler(Compiler): """The GNU compiler for building a shared library on Linux systems.""" _name = "GNU" From 175eb143d7e1c403ef40e3f1e265d850c050541f Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 7 May 2022 23:41:15 -0500 Subject: [PATCH 056/100] do not vectorize the entire kernel if some instruction are surrounded by conditional depending on vectorizing iname --- pyop2/global_kernel.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index bc8d3013c..d7aa5f49f 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -326,6 +326,8 @@ def builder(self): def code_to_compile(self): """Return the C/C++ source code as a string.""" from pyop2.codegen.rep2loopy import generate + from loopy.symbolic import get_dependencies + from functools import reduce wrapper = generate(self.builder) if self._extruded: @@ -355,7 +357,19 @@ def code_to_compile(self): and isinstance(wrapper.callables_table[name], lp.CallableKernel)): wrapper = lp.inline_callable_kernel(wrapper, name) - wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) + + all_insn_preds = reduce( + frozenset.union, + (insn.predicates + for insn in wrapper.default_entrypoint.instructions), + frozenset()) + + if iname not in get_dependencies(tuple(all_insn_preds)): + # https://github.com/inducer/loopy/issues/615 + # TODO: get rid of this guard once the loopy issue is fixed + wrapper = self.vectorise(wrapper, iname, + configuration["simd_width"]) + code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: From 8256bd2d85b6d41703b9ee517abff2495c181f2a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sun, 8 May 2022 16:22:53 -0500 Subject: [PATCH 057/100] loop being split starts from '0' => do not peel at the head --- pyop2/global_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index d7aa5f49f..4624ece96 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -439,7 +439,7 @@ def vectorise(self, wrapper, iname, batch_size): # }}} # split iname - slabs = (1, 1) + slabs = (0, 1) inner_iname = kernel.get_var_name_generator()(f"{shifted_iname}_batch") # in the ideal world breaks a loop of n*batch_size into two loops: From 4c0ca6e51c49d3bbb5634932786ae918e609c634 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Mon, 9 May 2022 13:35:49 +0200 Subject: [PATCH 058/100] Add comment --- pyop2/global_kernel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 4624ece96..8f4e79f77 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -403,7 +403,6 @@ def vectorise(self, wrapper, iname, batch_size): tmps = {name: tv.copy(alignment=alignment) for name, tv in kernel.temporary_variables.items()} kernel = kernel.copy(temporary_variables=tmps) - shifted_iname = kernel.get_var_name_generator()(f"{iname}_shift") # {{{ record temps that cannot be vectorized @@ -431,7 +430,7 @@ def vectorise(self, wrapper, iname, batch_size): from loopy.symbolic import pw_aff_to_expr import pymbolic.primitives as prim lbound = pw_aff_to_expr(kernel.get_iname_bounds(iname).lower_bound_pw_aff) - + shifted_iname = kernel.get_var_name_generator()(f"{iname}_shift") kernel = lp.affine_map_inames(kernel, iname, shifted_iname, [(prim.Variable(shifted_iname), (prim.Variable(iname) - lbound))]) @@ -439,6 +438,7 @@ def vectorise(self, wrapper, iname, batch_size): # }}} # split iname + # note there is no front slab needed because iname is shifted (see above) slabs = (0, 1) inner_iname = kernel.get_var_name_generator()(f"{shifted_iname}_batch") From e7440925c91b047ba3ce8c3ed11277c5c3e6d119 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 10 May 2022 09:35:51 +0200 Subject: [PATCH 059/100] Fix complex check? --- pyop2/global_kernel.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 8f4e79f77..7ad927a7f 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -340,8 +340,10 @@ def code_to_compile(self): has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg)) for arg in self.arguments) has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) - is_cplx = (any(arg.dtype == 'complex128' for arg in self.local_kernel.arguments) - or any(arg.dtype.dtype == 'complex128' for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) + is_cplx = (any(dtype == 'complex128' for dtype in self.local_kernel.dtypes) # local args complex? + or any(arg.dtype.dtype == 'complex128' for n in self.local_kernel.code.callables_table + for arg in tuple(self.local_kernel.code.callables_table[n].subkernel.temporary_variables.values())) # local temps complex? + or any(arg.dtype.dtype == 'complex128' for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords From 5fc42648c53da7eaaf856519cac97931e73533b4 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 10 May 2022 09:55:19 +0200 Subject: [PATCH 060/100] Fix complex check? --- pyop2/global_kernel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 7ad927a7f..88e95b79a 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -340,10 +340,10 @@ def code_to_compile(self): has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg)) for arg in self.arguments) has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) - is_cplx = (any(dtype == 'complex128' for dtype in self.local_kernel.dtypes) # local args complex? - or any(arg.dtype.dtype == 'complex128' for n in self.local_kernel.code.callables_table + is_cplx = (any(dtype.is_complex() for dtype in self.local_kernel.dtypes) # local args complex? + or any(arg.dtype.is_complex() for n in self.local_kernel.code.callables_table for arg in tuple(self.local_kernel.code.callables_table[n].subkernel.temporary_variables.values())) # local temps complex? - or any(arg.dtype.dtype == 'complex128' for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? + or any(arg.dtype.is_complex() for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords From 31f0c393523a070faf39ff5cb053fe0afa7fbd0b Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 10 May 2022 11:20:03 +0200 Subject: [PATCH 061/100] Fix complex check? --- pyop2/global_kernel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 88e95b79a..4897c067f 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -340,10 +340,10 @@ def code_to_compile(self): has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg)) for arg in self.arguments) has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) - is_cplx = (any(dtype.is_complex() for dtype in self.local_kernel.dtypes) # local args complex? - or any(arg.dtype.is_complex() for n in self.local_kernel.code.callables_table + is_cplx = (any(dtype.dtype=="complex128" for dtype in self.local_kernel.dtypes) # local args complex? + or any(arg.dtype.dtype=="complex128" for n in self.local_kernel.code.callables_table for arg in tuple(self.local_kernel.code.callables_table[n].subkernel.temporary_variables.values())) # local temps complex? - or any(arg.dtype.is_complex() for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? + or any(arg.dtype.dtype=="complex128" for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords From 7e8a86adb016489e5482a182c48213cbbd60a5e4 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 10 May 2022 13:40:26 +0200 Subject: [PATCH 062/100] Fix complex check? --- pyop2/global_kernel.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 4897c067f..22bd7da03 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -340,10 +340,8 @@ def code_to_compile(self): has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg)) for arg in self.arguments) has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) - is_cplx = (any(dtype.dtype=="complex128" for dtype in self.local_kernel.dtypes) # local args complex? - or any(arg.dtype.dtype=="complex128" for n in self.local_kernel.code.callables_table - for arg in tuple(self.local_kernel.code.callables_table[n].subkernel.temporary_variables.values())) # local temps complex? - or any(arg.dtype.dtype=="complex128" for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? + is_cplx = (any(lp.types.NumpyType(dtype).is_complex() if not isinstance(dtype, lp.types.NumpyType) else dtype for dtype in self.local_kernel.dtypes) # local args complex? + or any(arg.dtype.is_complex() for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords From 63f1e522e54465dbd0d2d7424b68d6f8bceb5ffc Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 11 May 2022 15:43:05 -0500 Subject: [PATCH 063/100] clarifies vectorization strategy --- pyop2/configuration.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 63256c012..6c495a15b 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -90,6 +90,12 @@ class Configuration(dict): cdim > 1 be built as block sparsities, or dof sparsities. The former saves memory but changes which preconditioners are available for the resulting matrices. (Default yes) + :param vectorization_strategy: A :class:`str` describing the + vectorization strategy that must to be applied to the kernels. Can + be one of the following -- + + - ``sun2020study``: Cross-element vectorization strategy of + ``__. """ # name, env variable, type, default, write once cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid()) @@ -109,7 +115,7 @@ class Configuration(dict): "simd_width": ("PYOP2_SIMD_WIDTH", int, default_simd_width()), "vectorization_strategy": - ("PYOP2_VECT_STRATEGY", str, "ve"), + ("PYOP2_VECT_STRATEGY", str, "sun2020study"), "alignment": ("PYOP2_ALIGNMENT", int, 64), "time": From 8b19370fe9cb986f1df4e1b4ad74305cb33836ee Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 11 May 2022 15:46:42 -0500 Subject: [PATCH 064/100] Updates to transform startegy - corrects typo in is_cplx calculation - do not vectorize BLAS call arguments (thx clang) - Tightens the interpretation of conf[vectorization_strategy] --- pyop2/global_kernel.py | 44 ++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 22bd7da03..e44651535 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -340,8 +340,13 @@ def code_to_compile(self): has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg)) for arg in self.arguments) has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) - is_cplx = (any(lp.types.NumpyType(dtype).is_complex() if not isinstance(dtype, lp.types.NumpyType) else dtype for dtype in self.local_kernel.dtypes) # local args complex? - or any(arg.dtype.is_complex() for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? + is_cplx = (any(lp.types.NumpyType(dtype).is_complex() + if not isinstance(dtype, lp.types.NumpyType) + else dtype.is_complex() + for dtype in self.local_kernel.dtypes + ) # local args complex? + or any(arg.dtype.is_complex() + for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords @@ -367,8 +372,13 @@ def code_to_compile(self): if iname not in get_dependencies(tuple(all_insn_preds)): # https://github.com/inducer/loopy/issues/615 # TODO: get rid of this guard once the loopy issue is fixed - wrapper = self.vectorise(wrapper, iname, - configuration["simd_width"]) + if configuration["vectorization_strategy"] == "sun2020study": + wrapper = self.vectorise(wrapper, iname, + configuration["simd_width"]) + else: + raise NotImplementedError( + "Vectorization strategy" + f" '{configuration['vectorization_strategy']}'") code = lp.generate_code_v2(wrapper) @@ -388,6 +398,9 @@ def vectorise(self, wrapper, iname, batch_size): if batch_size == 1: return wrapper + from functools import reduce + import pymbolic.primitives as prim + if not configuration["debug"]: # loopy warns for every instruction that cannot be vectorized; # ignore in non-debug mode. @@ -407,7 +420,6 @@ def vectorise(self, wrapper, iname, batch_size): # {{{ record temps that cannot be vectorized # Do not vectorize temporaries used outside *iname* - from functools import reduce temps_not_to_vectorize = reduce(set.union, [(insn.dependency_names() & frozenset(kernel.temporary_variables)) @@ -421,6 +433,19 @@ def vectorise(self, wrapper, iname, batch_size): if (tv.read_only and tv.initializer is not None)} + # {{{ clang (unlike gcc) does not allow taking address of vector-type + # variable + + # FIXME: Perform this only if we know we are not using gcc. + for insn in kernel.instructions: + if ( + isinstance(insn, lp.MultiAssignmentBase) + and isinstance(insn.expression, prim.Call) + and insn.expression.function.name in ["solve", "inverse"]): + temps_not_to_vectorize -= (insn.dependency_names()) + + # }}} + # }}} # {{{ TODO: placeholder until loopy's simplify_using_pwaff gets smarter @@ -428,7 +453,6 @@ def vectorise(self, wrapper, iname, batch_size): # transform to ensure that the loop undergoing array expansion has a # lower bound of '0' from loopy.symbolic import pw_aff_to_expr - import pymbolic.primitives as prim lbound = pw_aff_to_expr(kernel.get_iname_bounds(iname).lower_bound_pw_aff) shifted_iname = kernel.get_var_name_generator()(f"{iname}_shift") kernel = lp.affine_map_inames(kernel, iname, shifted_iname, @@ -442,12 +466,8 @@ def vectorise(self, wrapper, iname, batch_size): slabs = (0, 1) inner_iname = kernel.get_var_name_generator()(f"{shifted_iname}_batch") - # in the ideal world breaks a loop of n*batch_size into two loops: - # an outer loop of n/batch_size - # and an inner loop over batch_size - if configuration["vectorization_strategy"] == "ve": - kernel = lp.split_iname(kernel, shifted_iname, batch_size, slabs=slabs, - inner_iname=inner_iname) + kernel = lp.split_iname(kernel, shifted_iname, batch_size, slabs=slabs, + inner_iname=inner_iname) # adds a new axis to the temporary and indexes it with the provided iname # i.e. stores the value at each instance of the loop. (i.e. array From 7a2cbd6e857edc7b7425c428412b94f33e96c395 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 19 May 2022 11:25:04 +0200 Subject: [PATCH 065/100] Time configuration is not used anywhere and add doc --- pyop2/configuration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 6c495a15b..3e5dccc15 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -93,6 +93,8 @@ class Configuration(dict): :param vectorization_strategy: A :class:`str` describing the vectorization strategy that must to be applied to the kernels. Can be one of the following -- + :param alignment: A :class:`int` which specifies a size to which all temporaries + are aligned in memory. - ``sun2020study``: Cross-element vectorization strategy of ``__. @@ -118,8 +120,6 @@ class Configuration(dict): ("PYOP2_VECT_STRATEGY", str, "sun2020study"), "alignment": ("PYOP2_ALIGNMENT", int, 64), - "time": - ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), "compute_kernel_flops": From 69d4921f9fedb54bba01bdad9bfc50834b53f342 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 19 May 2022 11:27:21 +0200 Subject: [PATCH 066/100] Move conditional --- pyop2/global_kernel.py | 60 ++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index e44651535..1d9b8d8f7 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -348,37 +348,37 @@ def code_to_compile(self): or any(arg.dtype.is_complex() for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? - vectorisable = ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords + is_loopy_kernel = isinstance(self.local_kernel.code, lp.TranslationUnit) + vectorisable = is_loopy_kernel and((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords if vectorisable: - if isinstance(self.local_kernel.code, lp.TranslationUnit): - # change target to generate vectorized code via gcc vector - # extensions - wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) - # inline all inner kernels - names = self.local_kernel.code.callables_table - for name in names: - if (name in wrapper.callables_table.keys() - and isinstance(wrapper.callables_table[name], - lp.CallableKernel)): - wrapper = lp.inline_callable_kernel(wrapper, name) - - all_insn_preds = reduce( - frozenset.union, - (insn.predicates - for insn in wrapper.default_entrypoint.instructions), - frozenset()) - - if iname not in get_dependencies(tuple(all_insn_preds)): - # https://github.com/inducer/loopy/issues/615 - # TODO: get rid of this guard once the loopy issue is fixed - if configuration["vectorization_strategy"] == "sun2020study": - wrapper = self.vectorise(wrapper, iname, - configuration["simd_width"]) - else: - raise NotImplementedError( - "Vectorization strategy" - f" '{configuration['vectorization_strategy']}'") + # change target to generate vectorized code via gcc vector + # extensions + wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) + # inline all inner kernels + names = self.local_kernel.code.callables_table + for name in names: + if (name in wrapper.callables_table.keys() + and isinstance(wrapper.callables_table[name], + lp.CallableKernel)): + wrapper = lp.inline_callable_kernel(wrapper, name) + + all_insn_preds = reduce( + frozenset.union, + (insn.predicates + for insn in wrapper.default_entrypoint.instructions), + frozenset()) + + if iname not in get_dependencies(tuple(all_insn_preds)): + # https://github.com/inducer/loopy/issues/615 + # TODO: get rid of this guard once the loopy issue is fixed + if configuration["vectorization_strategy"] == "sun2020study": + wrapper = self.vectorise(wrapper, iname, + configuration["simd_width"]) + else: + raise NotImplementedError( + "Vectorization strategy" + f" '{configuration['vectorization_strategy']}'") code = lp.generate_code_v2(wrapper) @@ -443,6 +443,8 @@ def vectorise(self, wrapper, iname, batch_size): and isinstance(insn.expression, prim.Call) and insn.expression.function.name in ["solve", "inverse"]): temps_not_to_vectorize -= (insn.dependency_names()) + print("NO GCC") + print((insn.dependency_names())) # }}} From 43960e61366e75210250103e0fb519061da173b5 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 19 May 2022 11:29:22 +0200 Subject: [PATCH 067/100] sun2020study -> cross-element --- pyop2/configuration.py | 4 ++-- pyop2/global_kernel.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 3e5dccc15..55fd3918c 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -96,7 +96,7 @@ class Configuration(dict): :param alignment: A :class:`int` which specifies a size to which all temporaries are aligned in memory. - - ``sun2020study``: Cross-element vectorization strategy of + - ``cross-element``: Cross-element vectorization strategy of ``__. """ # name, env variable, type, default, write once @@ -117,7 +117,7 @@ class Configuration(dict): "simd_width": ("PYOP2_SIMD_WIDTH", int, default_simd_width()), "vectorization_strategy": - ("PYOP2_VECT_STRATEGY", str, "sun2020study"), + ("PYOP2_VECT_STRATEGY", str, "cross-element"), "alignment": ("PYOP2_ALIGNMENT", int, 64), "debug": diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 1d9b8d8f7..a92a06f99 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -372,7 +372,7 @@ def code_to_compile(self): if iname not in get_dependencies(tuple(all_insn_preds)): # https://github.com/inducer/loopy/issues/615 # TODO: get rid of this guard once the loopy issue is fixed - if configuration["vectorization_strategy"] == "sun2020study": + if configuration["vectorization_strategy"] == "cross-element": wrapper = self.vectorise(wrapper, iname, configuration["simd_width"]) else: From b4c9926a762476098bed33e9998e3cb4e5f3dee0 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 19 May 2022 11:39:55 +0200 Subject: [PATCH 068/100] Make default_simd_width more readable --- pyop2/configuration.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 55fd3918c..ccf06290d 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -44,16 +44,18 @@ def default_simd_width(): from cpuinfo import get_cpu_info avx_to_width = {'avx': 2, 'avx1': 2, 'avx128': 2, 'avx2': 4, 'avx256': 4, 'avx3': 8, 'avx512': 8} - longest_ext = [t for t in get_cpu_info()["flags"] if t.startswith('avx')][-1] - if longest_ext not in avx_to_width.keys(): - if longest_ext[:6] not in avx_to_width.keys(): - assert longest_ext[:4] in avx_to_width.keys(), \ - "The vector extension of your architecture is unknown. Disable vectorisation!" - return avx_to_width[longest_ext[:4]] - else: - return avx_to_width[longest_ext[:6]] + longest_simd_extension = [t for t in get_cpu_info()["flags"] if t.startswith('avx')][-1] + if longest_simd_extension in avx_to_width.keys(): + return avx_to_width[longest_simd_extension] + elif longest_simd_extension[:6] in avx_to_width.keys(): + return avx_to_width[longest_simd_extension[:6]] + elif longest_simd_extension[:4] in avx_to_width.keys(): + return avx_to_width[longest_simd_extension[:4]] else: - return avx_to_width[longest_ext] + raise ConfigurationError(f"The vector extension of your architecture is unknown.\ + Must be one of {str(avx_to_width.keys())}.\ + We advise to disable vectorisation." + ) class Configuration(dict): From c603f3fe73b15e515e637aa849e9ee108897f7a3 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 19 May 2022 11:41:48 +0200 Subject: [PATCH 069/100] cleanup --- pyop2/global_kernel.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index a92a06f99..5087b13d3 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -443,8 +443,6 @@ def vectorise(self, wrapper, iname, batch_size): and isinstance(insn.expression, prim.Call) and insn.expression.function.name in ["solve", "inverse"]): temps_not_to_vectorize -= (insn.dependency_names()) - print("NO GCC") - print((insn.dependency_names())) # }}} From 1cee3d7f03ffb59debb002c51a4e32131a7c41fe Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 19 May 2022 14:37:50 +0200 Subject: [PATCH 070/100] Lint --- pyop2/global_kernel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 5087b13d3..0b2467ee9 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -349,7 +349,7 @@ def code_to_compile(self): for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? is_loopy_kernel = isinstance(self.local_kernel.code, lp.TranslationUnit) - vectorisable = is_loopy_kernel and((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords + vectorisable = is_loopy_kernel and ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords if vectorisable: # change target to generate vectorized code via gcc vector @@ -359,8 +359,8 @@ def code_to_compile(self): names = self.local_kernel.code.callables_table for name in names: if (name in wrapper.callables_table.keys() - and isinstance(wrapper.callables_table[name], - lp.CallableKernel)): + and isinstance(wrapper.callables_table[name], + lp.CallableKernel)): wrapper = lp.inline_callable_kernel(wrapper, name) all_insn_preds = reduce( @@ -374,7 +374,7 @@ def code_to_compile(self): # TODO: get rid of this guard once the loopy issue is fixed if configuration["vectorization_strategy"] == "cross-element": wrapper = self.vectorise(wrapper, iname, - configuration["simd_width"]) + configuration["simd_width"]) else: raise NotImplementedError( "Vectorization strategy" From a671b6ccb50e2ed7b12df1800534f99979bdb9fb Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 19 May 2022 22:49:29 -0500 Subject: [PATCH 071/100] corrects the condition to not vectorize temps passed to BLAS calls --- pyop2/global_kernel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 0b2467ee9..307070e9d 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -387,6 +387,7 @@ def code_to_compile(self): preamble = "".join(process_preambles(getattr(code, "device_preambles", []))) device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" + return code.device_code() def vectorise(self, wrapper, iname, batch_size): @@ -442,7 +443,7 @@ def vectorise(self, wrapper, iname, batch_size): isinstance(insn, lp.MultiAssignmentBase) and isinstance(insn.expression, prim.Call) and insn.expression.function.name in ["solve", "inverse"]): - temps_not_to_vectorize -= (insn.dependency_names()) + temps_not_to_vectorize |= (insn.dependency_names()) # }}} From 4aa86e1414ab78d562698f8d5081a6c98b9d954b Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 15:19:19 +0200 Subject: [PATCH 072/100] Add vectorisation config to cache keys --- pyop2/caching.py | 2 +- pyop2/global_kernel.py | 16 +++++++++------- pyop2/local_kernel.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pyop2/caching.py b/pyop2/caching.py index 24a3f5513..aee31225d 100644 --- a/pyop2/caching.py +++ b/pyop2/caching.py @@ -237,7 +237,7 @@ def _cache_key(cls, *args, **kwargs): @cached_property def cache_key(self): """Cache key.""" - return self._key + return self._key + tuple(configuration["vectorization_strategy"]) cached = cachetools.cached diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 307070e9d..401c92a75 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -39,7 +39,7 @@ def __post_init__(self): @property def cache_key(self): - return type(self), self.arity, self.offset + return type(self), self.arity, self.offset, configuration["vectorization_strategy"] @dataclass(eq=False, frozen=True) @@ -59,7 +59,7 @@ def __post_init__(self): @property def cache_key(self): - return type(self), self.base_map.cache_key, tuple(self.permutation) + return type(self), self.base_map.cache_key, tuple(self.permutation), configuration["vectorization_strategy"] @dataclass(frozen=True) @@ -73,7 +73,7 @@ class GlobalKernelArg: @property def cache_key(self): - return type(self), self.dim + return type(self), self.dim, configuration["vectorization_strategy"] @property def maps(self): @@ -112,7 +112,7 @@ def is_indirect(self): @property def cache_key(self): map_key = self.map_.cache_key if self.map_ is not None else None - return type(self), self.dim, map_key, self.index + return type(self), self.dim, map_key, self.index, configuration["vectorization_strategy"] @property def maps(self): @@ -141,7 +141,7 @@ def pack(self): @property def cache_key(self): - return type(self), self.dims, tuple(m.cache_key for m in self.maps), self.unroll + return type(self), self.dims, tuple(m.cache_key for m in self.maps), self.unroll, configuration["vectorization_strategy"] @dataclass(frozen=True) @@ -161,7 +161,7 @@ def __len__(self): @property def cache_key(self): - return tuple(a.cache_key for a in self.arguments) + return tuple(a.cache_key for a in self.arguments) + tuple(configuration["vectorization_strategy"]) @property def maps(self): @@ -192,7 +192,7 @@ def __len__(self): @property def cache_key(self): - return tuple(a.cache_key for a in self.arguments) + return tuple(a.cache_key for a in self.arguments) + tuple(configuration["vectorization_strategy"]) @property def maps(self): @@ -351,6 +351,7 @@ def code_to_compile(self): is_loopy_kernel = isinstance(self.local_kernel.code, lp.TranslationUnit) vectorisable = is_loopy_kernel and ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords + print(vectorisable) if vectorisable: # change target to generate vectorized code via gcc vector # extensions @@ -381,6 +382,7 @@ def code_to_compile(self): f" '{configuration['vectorization_strategy']}'") code = lp.generate_code_v2(wrapper) + print(code) if self.local_kernel.cpp: from loopy.codegen.result import process_preambles diff --git a/pyop2/local_kernel.py b/pyop2/local_kernel.py index 4807463b8..1db70f99b 100644 --- a/pyop2/local_kernel.py +++ b/pyop2/local_kernel.py @@ -115,7 +115,7 @@ def dtypes(self): @property def cache_key(self): - return self._immutable_cache_key, self.accesses, self.dtypes + return self._immutable_cache_key, self.accesses, self.dtypes, configuration["vectorization_strategy"] @cached_property def _immutable_cache_key(self): From 60b4b3e1f31c8a8f6c5e635e8cb64f4234ef62cb Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 15:18:13 +0200 Subject: [PATCH 073/100] Tests: add a vectorisation test --- test/unit/test_vectorisation.py | 102 ++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 test/unit/test_vectorisation.py diff --git a/test/unit/test_vectorisation.py b/test/unit/test_vectorisation.py new file mode 100644 index 000000000..703b244ba --- /dev/null +++ b/test/unit/test_vectorisation.py @@ -0,0 +1,102 @@ +# This file is part of PyOP2 +# +# PyOP2 is Copyright (c) 2012, Imperial College London and +# others. Please see the AUTHORS file in the main source directory for +# a full list of copyright holders. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * The name of Imperial College London or that of other +# contributors may not be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS +# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +# OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np +from pyop2 import op2 +from pyop2.configuration import configuration +import glob +import os +import pytest + + +some_vectorisation_keys = ["__attribute__", "vector_size", "aligned", "#pragma omp simd"] + + +class TestVectorisation: + + @pytest.fixture + def s(self): + return op2.Set(1) + + @pytest.fixture + def md1(self, s): + n = op2.Dat(s, [3], np.float64) + o = op2.Dat(s, [4], np.float64) + md = op2.MixedDat([n, o]) + return md + + @pytest.fixture + def md(self, s): + n = op2.Dat(s, [4], np.float64) + o = op2.Dat(s, [5], np.float64) + md = op2.MixedDat([n, o]) + return md + + def test_vectorisation(self, md1, md): + # Test that vectorised code produced the correct result + ret = md.inner(md1) + assert abs(ret - 32) < 1e-12 + ret = md1.inner(md) + assert abs(ret - 32) < 1e-12 + + # Test that we actually vectorised + list_of_files = glob.glob(configuration["cache_dir"]+"/*/*.c") + latest_file = max(list_of_files, key=os.path.getctime) + with open(latest_file, 'r') as file: + generated_code = file.read() + assert (all(key in generated_code for key in some_vectorisation_keys)), "The kernel for an inner product has not been succesfully vectorised." + + def test_no_vectorisation(self, md1, md): + # turn vectorisation off + op2.init(**{"vectorization_strategy": ""}) + + # Test that unvectorised code produced the correct result + ret = md.inner(md1) + assert abs(ret - 32) < 1e-12 + ret = md1.inner(md) + assert abs(ret - 32) < 1e-12 + + # Test that we did not vectorise + print(configuration["cache_dir"]) + list_of_files = glob.glob(configuration["cache_dir"]+"/*/*.c") + print(list_of_files) + latest_file = max(list_of_files, key=os.path.getctime) + with open(latest_file, 'r') as file: + generated_code = file.read() + assert not (any(key in generated_code for key in some_vectorisation_keys)), "The kernel for an inner product has not been succesfully vectorised." + + # change vect config back to be turned on by default + op2.init(**{"vectorization_strategy": "cross-element"}) + + +if __name__ == '__main__': + pytest.main(os.path.abspath(__file__)) From 1b3c29ef235a168d56aece24a0dbf7977cba8188 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 15:27:02 +0200 Subject: [PATCH 074/100] Cleanup --- test/unit/test_vectorisation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/unit/test_vectorisation.py b/test/unit/test_vectorisation.py index 703b244ba..fcea2117b 100644 --- a/test/unit/test_vectorisation.py +++ b/test/unit/test_vectorisation.py @@ -86,9 +86,7 @@ def test_no_vectorisation(self, md1, md): assert abs(ret - 32) < 1e-12 # Test that we did not vectorise - print(configuration["cache_dir"]) list_of_files = glob.glob(configuration["cache_dir"]+"/*/*.c") - print(list_of_files) latest_file = max(list_of_files, key=os.path.getctime) with open(latest_file, 'r') as file: generated_code = file.read() From 0a54a34b0ca33b7b6ea461035369233b60194ac9 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 15:28:12 +0200 Subject: [PATCH 075/100] Cleanup --- pyop2/global_kernel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 401c92a75..a840eba86 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -351,7 +351,6 @@ def code_to_compile(self): is_loopy_kernel = isinstance(self.local_kernel.code, lp.TranslationUnit) vectorisable = is_loopy_kernel and ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords - print(vectorisable) if vectorisable: # change target to generate vectorized code via gcc vector # extensions From 9b23200d8942a89f0044fe74c2d7415842b798fc Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 17:07:47 +0200 Subject: [PATCH 076/100] Use reconfigure not init for changing the vectorisation strategy in the test --- test/unit/test_vectorisation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit/test_vectorisation.py b/test/unit/test_vectorisation.py index fcea2117b..19e51f8b4 100644 --- a/test/unit/test_vectorisation.py +++ b/test/unit/test_vectorisation.py @@ -77,7 +77,7 @@ def test_vectorisation(self, md1, md): def test_no_vectorisation(self, md1, md): # turn vectorisation off - op2.init(**{"vectorization_strategy": ""}) + configuration.reconfigure(vectorization_strategy="") # Test that unvectorised code produced the correct result ret = md.inner(md1) @@ -93,7 +93,7 @@ def test_no_vectorisation(self, md1, md): assert not (any(key in generated_code for key in some_vectorisation_keys)), "The kernel for an inner product has not been succesfully vectorised." # change vect config back to be turned on by default - op2.init(**{"vectorization_strategy": "cross-element"}) + configuration.reconfigure(vectorization_strategy="cross-element") if __name__ == '__main__': From acb9c89f17fe88fa553744ec3c48197ee72c027d Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 17:50:57 +0200 Subject: [PATCH 077/100] Cleanup --- pyop2/global_kernel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index a840eba86..1b62f25e4 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -381,7 +381,6 @@ def code_to_compile(self): f" '{configuration['vectorization_strategy']}'") code = lp.generate_code_v2(wrapper) - print(code) if self.local_kernel.cpp: from loopy.codegen.result import process_preambles From 49e2779bd9cbcb8af9d2403185fff04f31eba743 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 17:51:17 +0200 Subject: [PATCH 078/100] Test: improve the vectorisation test. --- test/unit/test_vectorisation.py | 60 ++++++++++++++++----------------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/test/unit/test_vectorisation.py b/test/unit/test_vectorisation.py index 19e51f8b4..7425a9eae 100644 --- a/test/unit/test_vectorisation.py +++ b/test/unit/test_vectorisation.py @@ -33,9 +33,11 @@ import numpy as np from pyop2 import op2 from pyop2.configuration import configuration -import glob import os import pytest +from pyop2.parloop import LegacyParloop +from pyop2.types.glob import Global +from pyop2.types import Access some_vectorisation_keys = ["__attribute__", "vector_size", "aligned", "#pragma omp simd"] @@ -48,49 +50,45 @@ def s(self): return op2.Set(1) @pytest.fixture - def md1(self, s): - n = op2.Dat(s, [3], np.float64) - o = op2.Dat(s, [4], np.float64) - md = op2.MixedDat([n, o]) - return md + def d1(self, s): + return op2.Dat(s, [3], np.float64) @pytest.fixture - def md(self, s): - n = op2.Dat(s, [4], np.float64) - o = op2.Dat(s, [5], np.float64) - md = op2.MixedDat([n, o]) - return md + def d(self, s): + return op2.Dat(s, [4], np.float64) - def test_vectorisation(self, md1, md): + def inner(self, s, o): + s._check_shape(o) + ret = Global(1, data=0, dtype=s.dtype) + inner_parloop = LegacyParloop(s._inner_kernel(o.dtype), s.dataset.set, + s(Access.READ), o(Access.READ), ret(Access.INC)) + inner_parloop.compute() + return (inner_parloop.global_kernel, ret.data_ro[0]) + + def test_vectorisation(self, d1, d): # Test that vectorised code produced the correct result - ret = md.inner(md1) - assert abs(ret - 32) < 1e-12 - ret = md1.inner(md) - assert abs(ret - 32) < 1e-12 + kernel1, ret = self.inner(d, d1) + assert abs(ret - 12) < 1e-12 + kernel2, ret = self.inner(d1, d) + assert abs(ret - 12) < 1e-12 # Test that we actually vectorised - list_of_files = glob.glob(configuration["cache_dir"]+"/*/*.c") - latest_file = max(list_of_files, key=os.path.getctime) - with open(latest_file, 'r') as file: - generated_code = file.read() - assert (all(key in generated_code for key in some_vectorisation_keys)), "The kernel for an inner product has not been succesfully vectorised." + assert all(key in kernel1.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d, d) has not been succesfully vectorised." + assert all(key in kernel2.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d1, d) has not been succesfully vectorised." - def test_no_vectorisation(self, md1, md): + def test_no_vectorisation(self, d1, d): # turn vectorisation off configuration.reconfigure(vectorization_strategy="") # Test that unvectorised code produced the correct result - ret = md.inner(md1) - assert abs(ret - 32) < 1e-12 - ret = md1.inner(md) - assert abs(ret - 32) < 1e-12 + kernel1, ret = self.inner(d, d1) + assert abs(ret - 12) < 1e-12 + kernel2, ret = self.inner(d1, d) + assert abs(ret - 12) < 1e-12 # Test that we did not vectorise - list_of_files = glob.glob(configuration["cache_dir"]+"/*/*.c") - latest_file = max(list_of_files, key=os.path.getctime) - with open(latest_file, 'r') as file: - generated_code = file.read() - assert not (any(key in generated_code for key in some_vectorisation_keys)), "The kernel for an inner product has not been succesfully vectorised." + assert not any(key in kernel1.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d, d) has been vectorised even though we turned it off." + assert not any(key in kernel2.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d1, d) has been vectorised even though we turned it off." # change vect config back to be turned on by default configuration.reconfigure(vectorization_strategy="cross-element") From e5fe4d27c2dc5d6432cd98fc2ba8d328df4b8c09 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 24 May 2022 17:58:28 +0200 Subject: [PATCH 079/100] Put vectorisation strategy only in cache key of the global kernel. --- pyop2/caching.py | 2 +- pyop2/global_kernel.py | 15 ++++++++------- pyop2/local_kernel.py | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/pyop2/caching.py b/pyop2/caching.py index aee31225d..24a3f5513 100644 --- a/pyop2/caching.py +++ b/pyop2/caching.py @@ -237,7 +237,7 @@ def _cache_key(cls, *args, **kwargs): @cached_property def cache_key(self): """Cache key.""" - return self._key + tuple(configuration["vectorization_strategy"]) + return self._key cached = cachetools.cached diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 1b62f25e4..8fc763a8e 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -39,7 +39,7 @@ def __post_init__(self): @property def cache_key(self): - return type(self), self.arity, self.offset, configuration["vectorization_strategy"] + return type(self), self.arity, self.offset @dataclass(eq=False, frozen=True) @@ -59,7 +59,7 @@ def __post_init__(self): @property def cache_key(self): - return type(self), self.base_map.cache_key, tuple(self.permutation), configuration["vectorization_strategy"] + return type(self), self.base_map.cache_key, tuple(self.permutation) @dataclass(frozen=True) @@ -73,7 +73,7 @@ class GlobalKernelArg: @property def cache_key(self): - return type(self), self.dim, configuration["vectorization_strategy"] + return type(self), self.dim @property def maps(self): @@ -112,7 +112,7 @@ def is_indirect(self): @property def cache_key(self): map_key = self.map_.cache_key if self.map_ is not None else None - return type(self), self.dim, map_key, self.index, configuration["vectorization_strategy"] + return type(self), self.dim, map_key, self.index @property def maps(self): @@ -141,7 +141,7 @@ def pack(self): @property def cache_key(self): - return type(self), self.dims, tuple(m.cache_key for m in self.maps), self.unroll, configuration["vectorization_strategy"] + return type(self), self.dims, tuple(m.cache_key for m in self.maps), self.unroll @dataclass(frozen=True) @@ -161,7 +161,7 @@ def __len__(self): @property def cache_key(self): - return tuple(a.cache_key for a in self.arguments) + tuple(configuration["vectorization_strategy"]) + return tuple(a.cache_key for a in self.arguments) @property def maps(self): @@ -233,7 +233,8 @@ class GlobalKernel(Cached): @classmethod def _cache_key(cls, local_knl, arguments, **kwargs): key = [cls, local_knl.cache_key, - *kwargs.items(), configuration["simd_width"]] + *kwargs.items(), configuration["simd_width"], + configuration["vectorization_strategy"]] key.extend([a.cache_key for a in arguments]) diff --git a/pyop2/local_kernel.py b/pyop2/local_kernel.py index 1db70f99b..4807463b8 100644 --- a/pyop2/local_kernel.py +++ b/pyop2/local_kernel.py @@ -115,7 +115,7 @@ def dtypes(self): @property def cache_key(self): - return self._immutable_cache_key, self.accesses, self.dtypes, configuration["vectorization_strategy"] + return self._immutable_cache_key, self.accesses, self.dtypes @cached_property def _immutable_cache_key(self): From 0eff9d608a600a4b62723a2c66ca45f62b9f14f8 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 25 May 2022 15:31:34 +0200 Subject: [PATCH 080/100] lint --- pyop2/global_kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 8fc763a8e..fcb274c15 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -161,7 +161,7 @@ def __len__(self): @property def cache_key(self): - return tuple(a.cache_key for a in self.arguments) + return tuple(a.cache_key for a in self.arguments) @property def maps(self): From 22ce06eecf9d71a1476f527e677c1458f4e6fdbc Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 25 May 2022 15:31:50 +0200 Subject: [PATCH 081/100] Fix docs --- pyop2/configuration.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index ccf06290d..7bd86448a 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -95,11 +95,10 @@ class Configuration(dict): :param vectorization_strategy: A :class:`str` describing the vectorization strategy that must to be applied to the kernels. Can be one of the following -- - :param alignment: A :class:`int` which specifies a size to which all temporaries - are aligned in memory. - - ``cross-element``: Cross-element vectorization strategy of ``__. + :param alignment: A :class:`int` which specifies a size to which all temporaries + are aligned in memory. """ # name, env variable, type, default, write once cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid()) From bdefbfa0e15e7cbc2a720995d89fa55e44f0adbb Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 25 May 2022 15:47:19 +0200 Subject: [PATCH 082/100] Fix config error --- pyop2/configuration.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 7bd86448a..4de718da9 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -54,7 +54,8 @@ def default_simd_width(): else: raise ConfigurationError(f"The vector extension of your architecture is unknown.\ Must be one of {str(avx_to_width.keys())}.\ - We advise to disable vectorisation." + We advise to disable vectorisation \ + with export PYOP2_VECT_STRATEGY=""." ) @@ -192,5 +193,7 @@ def __setitem__(self, key, value): configuration = Configuration() +if configuration["vectorization_strategy"]: + configuration["simd_width"] = default_simd_width() target = CWithGNULibcTarget() From 2a459e5db1db4ed27975576054c8e17de354b091 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 25 May 2022 15:48:08 +0200 Subject: [PATCH 083/100] Fix config error --- pyop2/configuration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 4de718da9..efd78071c 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -117,7 +117,7 @@ class Configuration(dict): "ldflags": ("PYOP2_LDFLAGS", str, ""), "simd_width": - ("PYOP2_SIMD_WIDTH", int, default_simd_width()), + ("PYOP2_SIMD_WIDTH", int, 1), "vectorization_strategy": ("PYOP2_VECT_STRATEGY", str, "cross-element"), "alignment": From 56c65da33764ee5a74305594d7365cd28ad8b9bb Mon Sep 17 00:00:00 2001 From: Jack Betteridge Date: Fri, 27 May 2022 17:14:10 +0100 Subject: [PATCH 084/100] Don't add py-cpuinfo --- pyop2/__init__.py | 5 +++++ pyop2/compilation.py | 21 +++++++++++++++++++++ pyop2/configuration.py | 23 +---------------------- requirements-ext.txt | 1 - 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/pyop2/__init__.py b/pyop2/__init__.py index f0deef2e1..864b55af9 100644 --- a/pyop2/__init__.py +++ b/pyop2/__init__.py @@ -7,3 +7,8 @@ from pyop2._version import get_versions __version__ = get_versions()['version'] del get_versions + +from pyop2.configuration import configuration +from pyop2.compilation import max_simd_width +if configuration["vectorization_strategy"]: + configuration["simd_width"] = max_simd_width() diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 1f62b2c9b..2749ddde8 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -46,6 +46,7 @@ from pyop2.mpi import MPI, collective, COMM_WORLD from pyop2.mpi import dup_comm, get_compilation_comm, set_compilation_comm +from pyop2.caching import cached from pyop2.configuration import configuration from pyop2.logger import warning, debug, progress, INFO from pyop2.exceptions import CompilationError @@ -696,3 +697,23 @@ def clear_cache(prompt=False): shutil.rmtree(cachedir) else: print("Not removing cached libraries") + + +@cached(cache={}) +def max_simd_width(): + prg_str = '''#include + +int get_simd_width(){ + return __builtin_cpu_supports("avx512f") ? 8: + __builtin_cpu_supports("avx") ? 4: + __builtin_cpu_supports("sse") ? 2: + 1; +} +''' + try: + simd_width = load(prg_str, "c", "get_simd_width", restype=ctypes.c_int) + width = simd_width() + except (OSError, CompilationError): + warning("Cannot sniff SIMD width, using default of 4 doubles") + width = 4 + return width diff --git a/pyop2/configuration.py b/pyop2/configuration.py index efd78071c..b859a973a 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -40,25 +40,6 @@ from pyop2.exceptions import ConfigurationError -def default_simd_width(): - from cpuinfo import get_cpu_info - avx_to_width = {'avx': 2, 'avx1': 2, 'avx128': 2, 'avx2': 4, - 'avx256': 4, 'avx3': 8, 'avx512': 8} - longest_simd_extension = [t for t in get_cpu_info()["flags"] if t.startswith('avx')][-1] - if longest_simd_extension in avx_to_width.keys(): - return avx_to_width[longest_simd_extension] - elif longest_simd_extension[:6] in avx_to_width.keys(): - return avx_to_width[longest_simd_extension[:6]] - elif longest_simd_extension[:4] in avx_to_width.keys(): - return avx_to_width[longest_simd_extension[:4]] - else: - raise ConfigurationError(f"The vector extension of your architecture is unknown.\ - Must be one of {str(avx_to_width.keys())}.\ - We advise to disable vectorisation \ - with export PYOP2_VECT_STRATEGY=""." - ) - - class Configuration(dict): r"""PyOP2 configuration parameters @@ -117,7 +98,7 @@ class Configuration(dict): "ldflags": ("PYOP2_LDFLAGS", str, ""), "simd_width": - ("PYOP2_SIMD_WIDTH", int, 1), + ("PYOP2_SIMD_WIDTH", int, 4), "vectorization_strategy": ("PYOP2_VECT_STRATEGY", str, "cross-element"), "alignment": @@ -193,7 +174,5 @@ def __setitem__(self, key, value): configuration = Configuration() -if configuration["vectorization_strategy"]: - configuration["simd_width"] = default_simd_width() target = CWithGNULibcTarget() diff --git a/requirements-ext.txt b/requirements-ext.txt index 2121947f8..75adb64e3 100644 --- a/requirements-ext.txt +++ b/requirements-ext.txt @@ -4,7 +4,6 @@ pytest>=2.3 flake8>=2.1.0 pycparser>=2.10 mpi4py>=1.3.1 -py-cpuinfo decorator<=4.4.2 dataclasses cachetools From ca5c51b598ce77ca234b5deedafdd25c06a0c782 Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Wed, 22 Jun 2022 14:59:41 +0100 Subject: [PATCH 085/100] Add nbytes property N.B. This is currently set to use PYOP2_TIME as the configuration option. This is misleading and should be changed. --- pyop2/configuration.py | 2 ++ pyop2/parloop.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 29717718c..133eb611d 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -92,6 +92,8 @@ class Configuration(dict): ("PYOP2_LDFLAGS", str, ""), "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), + "time": + ("PYOP2_TIME", bool, False), "debug": ("PYOP2_DEBUG", bool, False), "compute_kernel_flops": diff --git a/pyop2/parloop.py b/pyop2/parloop.py index 8384268cf..956692e7f 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -187,6 +187,10 @@ def _compute(self, part): :arg part: The :class:`SetPartition` to compute over. """ + if configuration["time"]: + nbytes = self.comm.allreduce(self.nbytes) + if self.comm.Get_rank() == 0: + print("{0}_BYTES= {1}".format(self.global_kernel.name, nbytes)) with self._compute_event(): PETSc.Log.logFlops(part.size*self.num_flops) self.global_kernel(self.comm, part.offset, part.offset+part.size, *self.arglist) @@ -195,6 +199,22 @@ def _compute(self, part): def num_flops(self): return self.global_kernel.num_flops(self.iterset) + @cached_property + def nbytes(self): + nbytes = 0 + seen = set() + for arg in self.arguments: + nbytes += arg.data.nbytes + for map_ in arg.maps: + if map_ is None: + continue + for k in map_._kernel_args_: + if k in seen: + continue + nbytes += map_.values.nbytes + seen.add(k) + return nbytes + @mpi.collective def compute(self): # Parloop.compute is an alias for Parloop.__call__ From dc5f3bcaca0b7d4c99f1f871710458d04a6bfa9e Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 22 Jun 2022 16:03:53 +0200 Subject: [PATCH 086/100] Drop unused args --- pyop2/global_kernel.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index fcb274c15..18fd65c10 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -435,6 +435,10 @@ def vectorise(self, wrapper, iname, batch_size): if (tv.read_only and tv.initializer is not None)} + temps_not_to_vectorize |= {name + for name, tv in kernel.temporary_variables.items() + if kernel.writer_map().get(tv.name, set()) | kernel.reader_map().get(tv.name, set()) == set()} + # {{{ clang (unlike gcc) does not allow taking address of vector-type # variable From ac36708244edad40fe024e616c64d99f24e6d286 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Wed, 22 Jun 2022 16:08:29 +0200 Subject: [PATCH 087/100] Time->extra_info --- pyop2/configuration.py | 4 ++-- pyop2/parloop.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 133eb611d..0f42c6b56 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -92,8 +92,8 @@ class Configuration(dict): ("PYOP2_LDFLAGS", str, ""), "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), - "time": - ("PYOP2_TIME", bool, False), + "extra_info": + ("PYOP2_EXTRA_INFO", bool, False), "debug": ("PYOP2_DEBUG", bool, False), "compute_kernel_flops": diff --git a/pyop2/parloop.py b/pyop2/parloop.py index 956692e7f..7cb5cb518 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -187,7 +187,7 @@ def _compute(self, part): :arg part: The :class:`SetPartition` to compute over. """ - if configuration["time"]: + if configuration["extra_info"]: nbytes = self.comm.allreduce(self.nbytes) if self.comm.Get_rank() == 0: print("{0}_BYTES= {1}".format(self.global_kernel.name, nbytes)) From 89feb72b7913f2f6f1106e7bcfee7f7b41af5d95 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 24 Jun 2022 13:11:46 +0100 Subject: [PATCH 088/100] Fix bandwidth calculation --- pyop2/parloop.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pyop2/parloop.py b/pyop2/parloop.py index 7cb5cb518..eed2c464b 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -203,9 +203,12 @@ def num_flops(self): def nbytes(self): nbytes = 0 seen = set() - for arg in self.arguments: - nbytes += arg.data.nbytes - for map_ in arg.maps: + for lk_arg, gk_arg, pl_arg in self.zipped_arguments: + if lk_arg.access == Access.INC: + nbytes += pl_arg.data.nbytes * 2 + else: + nbytes += pl_arg.data.nbytes + for map_ in pl_arg.maps: if map_ is None: continue for k in map_._kernel_args_: From 085714536a4c005c0cd837f61bbefd8b1a0c8a2c Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Fri, 24 Jun 2022 13:12:24 +0100 Subject: [PATCH 089/100] Add simd compiler flag also to LinuxGNU compiler --- pyop2/compilation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 2749ddde8..3c89a580b 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -487,7 +487,7 @@ class LinuxGnuCompiler(Compiler): _cxxflags = ("-fPIC", "-Wall") _ldflags = ("-shared",) - _optflags = ("-march=native", "-O3", "-ffast-math") + _optflags = ("-march=native", "-O3", "-ffast-math","-fopenmp") _debugflags = ("-O0", "-g") def sniff_compiler_version(self, cpp=False): From 662241e3cbfaed5e8079ac693ca78c38db1cd379 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Mon, 27 Jun 2022 11:46:19 +0100 Subject: [PATCH 090/100] Add vectorisation flag to linux clang compiler too --- pyop2/compilation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 3c89a580b..a4fc5c9eb 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -544,7 +544,7 @@ class LinuxClangCompiler(Compiler): _cxxflags = ("-fPIC", "-Wall") _ldflags = ("-shared", "-L/usr/lib") - _optflags = ("-march=native", "-O3", "-ffast-math") + _optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp-simd") _debugflags = ("-O0", "-g") From 203223c0e3706f6dd711bc9ed96e3c452c8d4084 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 6 Jul 2022 13:23:27 -0500 Subject: [PATCH 091/100] account for changed in loopy's vectorization syntax --- pyop2/compilation.py | 2 +- pyop2/global_kernel.py | 7 ++++--- pyop2/parloop.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index a4fc5c9eb..9761f0e97 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -487,7 +487,7 @@ class LinuxGnuCompiler(Compiler): _cxxflags = ("-fPIC", "-Wall") _ldflags = ("-shared",) - _optflags = ("-march=native", "-O3", "-ffast-math","-fopenmp") + _optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp") _debugflags = ("-O0", "-g") def sniff_compiler_version(self, cpp=False): diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 18fd65c10..3e33c2199 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -355,7 +355,9 @@ def code_to_compile(self): if vectorisable: # change target to generate vectorized code via gcc vector # extensions - wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget()) + wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget( + vec_fallback=lp.VectorizationFallback.OMP_SIMD + )) # inline all inner kernels names = self.local_kernel.code.callables_table for name in names: @@ -487,8 +489,7 @@ def vectorise(self, wrapper, iname, batch_size): kernel = lp.tag_array_axes(kernel, name, tag) # tag the inner iname as vectorized - kernel = lp.tag_inames(kernel, - {inner_iname: lp.VectorizeTag(lp.OpenMPSIMDTag())}) + kernel = lp.tag_inames(kernel, {inner_iname: "vec"}) return wrapper.with_kernel(kernel) diff --git a/pyop2/parloop.py b/pyop2/parloop.py index eed2c464b..5d28566cb 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -207,7 +207,7 @@ def nbytes(self): if lk_arg.access == Access.INC: nbytes += pl_arg.data.nbytes * 2 else: - nbytes += pl_arg.data.nbytes + nbytes += pl_arg.data.nbytes for map_ in pl_arg.maps: if map_ is None: continue From fae323f68b0b27a61d3cdbb7b9650ac042b82c6e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 6 Jul 2022 14:09:03 -0500 Subject: [PATCH 092/100] run CI with py3.8 Loopy now requires py3.8 --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45df5ed57..ab79c43fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: - name: Set correct Python version uses: actions/setup-python@v2 with: - python-version: '3.6' + python-version: '3.8' - name: Clone PETSc uses: actions/checkout@v2 From 030cae5e1a8a9749a2647ead15e3f61ccee976b1 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 7 Jul 2022 12:10:44 +0200 Subject: [PATCH 093/100] Fallback for stopping criterium --- pyop2/global_kernel.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 3e33c2199..2ea444d8c 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -491,6 +491,18 @@ def vectorise(self, wrapper, iname, batch_size): # tag the inner iname as vectorized kernel = lp.tag_inames(kernel, {inner_iname: "vec"}) + all_insn_cinsn = list(insn for insn in wrapper.default_entrypoint.instructions if isinstance(insn, lp.CInstruction)) + # {{{ fallback --> + for insn in all_insn_cinsn: + wrapper = lp.distribute_loops(wrapper.default_entrypoint, + insn.id, + outer_inames=shifted_iname) + renamed_j, = wrapper.id_to_insn[insn.id].within_inames - shifted_iname + wrapper = lp.untag_inames(wrapper, renamed_j, VectorizeTag) + wrapper = lp.tag_inames(wrapper, {renamed_j: "unr"}) + + # }}} + return wrapper.with_kernel(kernel) @PETSc.Log.EventDecorator() From ece0e6224d8906e9c551224c9c432a26de4dbdf7 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 7 Jul 2022 18:44:20 +0200 Subject: [PATCH 094/100] Fallback for stopping criterium --- pyop2/global_kernel.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 2ea444d8c..28ff7a0d5 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -491,17 +491,21 @@ def vectorise(self, wrapper, iname, batch_size): # tag the inner iname as vectorized kernel = lp.tag_inames(kernel, {inner_iname: "vec"}) - all_insn_cinsn = list(insn for insn in wrapper.default_entrypoint.instructions if isinstance(insn, lp.CInstruction)) - # {{{ fallback --> - for insn in all_insn_cinsn: - wrapper = lp.distribute_loops(wrapper.default_entrypoint, - insn.id, - outer_inames=shifted_iname) - renamed_j, = wrapper.id_to_insn[insn.id].within_inames - shifted_iname - wrapper = lp.untag_inames(wrapper, renamed_j, VectorizeTag) - wrapper = lp.tag_inames(wrapper, {renamed_j: "unr"}) - - # }}} + # unroll CInstructions + from loopy.match import Id, Or + cinsn_ids = [cinsn.id + for cinsn in kernel.instructions + if (isinstance(cinsn, lp.CInstruction) and cinsn.predicates)] + cinsn_match = Or(tuple(Id(cinsn_id) for cinsn_id in cinsn_ids)) + outer_inames = frozenset([shifted_iname+"_outer"]) + kernel = lp.distribute_loops(kernel, + cinsn_match, + outer_inames=outer_inames) + inames_to_untag = [kernel.id_to_insn[cinsn_id].within_inames - outer_inames + for cinsn_id in cinsn_ids] + for iname_to_untag in inames_to_untag: + kernel = lp.untag_inames(kernel, iname_to_untag, lp.VectorizeTag) + kernel = lp.tag_inames(kernel, {iname_to_untag: "unr" for iname_to_untag in inames_to_untag}) return wrapper.with_kernel(kernel) From 934e14723ecf4e1956de84d796419cf81628c686 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 7 Jul 2022 21:54:16 +0200 Subject: [PATCH 095/100] Reduce inames to untag --- pyop2/global_kernel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 28ff7a0d5..4b638b5eb 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -383,6 +383,7 @@ def code_to_compile(self): "Vectorization strategy" f" '{configuration['vectorization_strategy']}'") + print(wrapper) code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: @@ -501,8 +502,8 @@ def vectorise(self, wrapper, iname, batch_size): kernel = lp.distribute_loops(kernel, cinsn_match, outer_inames=outer_inames) - inames_to_untag = [kernel.id_to_insn[cinsn_id].within_inames - outer_inames - for cinsn_id in cinsn_ids] + inames_to_untag = reduce(set.union, [kernel.id_to_insn[cinsn_id].within_inames - outer_inames + for cinsn_id in cinsn_ids], set()) for iname_to_untag in inames_to_untag: kernel = lp.untag_inames(kernel, iname_to_untag, lp.VectorizeTag) kernel = lp.tag_inames(kernel, {iname_to_untag: "unr" for iname_to_untag in inames_to_untag}) From bd95ba38294fdb8b9399801e6b09eaecfb9b67f3 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 7 Jul 2022 21:54:52 +0200 Subject: [PATCH 096/100] Reduce inames to untag --- pyop2/global_kernel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 4b638b5eb..1c11b9990 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -383,7 +383,6 @@ def code_to_compile(self): "Vectorization strategy" f" '{configuration['vectorization_strategy']}'") - print(wrapper) code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: From fd6650d4f984b1e9cd0611e8ec45d61d689dc1f9 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Thu, 7 Jul 2022 22:26:41 +0200 Subject: [PATCH 097/100] Fallback for stopping criterium --- pyop2/global_kernel.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 1c11b9990..11e47cd9b 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -383,6 +383,7 @@ def code_to_compile(self): "Vectorization strategy" f" '{configuration['vectorization_strategy']}'") + print(wrapper) code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: @@ -494,18 +495,23 @@ def vectorise(self, wrapper, iname, batch_size): # unroll CInstructions from loopy.match import Id, Or cinsn_ids = [cinsn.id - for cinsn in kernel.instructions - if (isinstance(cinsn, lp.CInstruction) and cinsn.predicates)] + for cinsn in kernel.instructions + if (isinstance(cinsn, lp.CInstruction) and cinsn.predicates)] cinsn_match = Or(tuple(Id(cinsn_id) for cinsn_id in cinsn_ids)) - outer_inames = frozenset([shifted_iname+"_outer"]) + outer_inames = reduce(set.union, [kernel.id_to_insn[cinsn_id].within_inames - set([inner_iname,]) + for cinsn_id in cinsn_ids], set()) kernel = lp.distribute_loops(kernel, cinsn_match, outer_inames=outer_inames) - inames_to_untag = reduce(set.union, [kernel.id_to_insn[cinsn_id].within_inames - outer_inames - for cinsn_id in cinsn_ids], set()) - for iname_to_untag in inames_to_untag: - kernel = lp.untag_inames(kernel, iname_to_untag, lp.VectorizeTag) - kernel = lp.tag_inames(kernel, {iname_to_untag: "unr" for iname_to_untag in inames_to_untag}) + kernel = lp.untag_inames(kernel, inner_iname, lp.VectorizeTag) + kernel = lp.tag_inames(kernel, {inner_iname: "unr"}) + + # remove noop instructions + # FIXME we need to this because there is a bug in distribute loops + noop_insn_ids = set([cinsn.id + for cinsn in kernel.instructions + if isinstance(cinsn, lp.NoOpInstruction)]) + kernel = lp.remove_instructions(kernel, noop_insn_ids) return wrapper.with_kernel(kernel) From f69755d5d43398489a615c5ce1c94f7889c1a0d6 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jul 2022 07:32:23 -0500 Subject: [PATCH 098/100] unroll (not vectorize) loops surrounding CInstructions --- pyop2/global_kernel.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 11e47cd9b..187a89eaf 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -492,26 +492,20 @@ def vectorise(self, wrapper, iname, batch_size): # tag the inner iname as vectorized kernel = lp.tag_inames(kernel, {inner_iname: "vec"}) - # unroll CInstructions - from loopy.match import Id, Or + # {{{ duplicate the inames surrounding the CInstructions with predicates + cinsn_ids = [cinsn.id for cinsn in kernel.instructions if (isinstance(cinsn, lp.CInstruction) and cinsn.predicates)] - cinsn_match = Or(tuple(Id(cinsn_id) for cinsn_id in cinsn_ids)) - outer_inames = reduce(set.union, [kernel.id_to_insn[cinsn_id].within_inames - set([inner_iname,]) - for cinsn_id in cinsn_ids], set()) - kernel = lp.distribute_loops(kernel, - cinsn_match, - outer_inames=outer_inames) - kernel = lp.untag_inames(kernel, inner_iname, lp.VectorizeTag) - kernel = lp.tag_inames(kernel, {inner_iname: "unr"}) - - # remove noop instructions - # FIXME we need to this because there is a bug in distribute loops - noop_insn_ids = set([cinsn.id - for cinsn in kernel.instructions - if isinstance(cinsn, lp.NoOpInstruction)]) - kernel = lp.remove_instructions(kernel, noop_insn_ids) + + for cinsn_id in cinsn_ids: + kernel = lp.duplicate_inames(kernel, + (inner_iname,), + within=f"id:{cinsn_id}", + tags={inner_iname: "unr"} + ) + + # }}} return wrapper.with_kernel(kernel) From e72f31671d43eea2031751d1470e07df031dcfd4 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jul 2022 09:54:19 -0500 Subject: [PATCH 099/100] get rid of noop insns --- pyop2/global_kernel.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 187a89eaf..a097f6447 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -383,7 +383,6 @@ def code_to_compile(self): "Vectorization strategy" f" '{configuration['vectorization_strategy']}'") - print(wrapper) code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: @@ -416,6 +415,17 @@ def vectorise(self, wrapper, iname, batch_size): kernel = wrapper.default_entrypoint + # {{{ get rid of noop insns + + from loopy.match import Id, Or + + noop_insn_ids = [Id(insn.id) + for insn in kernel.instructions + if isinstance(insn, lp.NoOpInstruction)] + kernel = lp.remove_instructions(kernel, Or(tuple(noop_insn_ids))) + + # }}} + # align temps alignment = configuration["alignment"] tmps = {name: tv.copy(alignment=alignment) From 09bf6297374e106a134bd354ea3d2e2f7773c821 Mon Sep 17 00:00:00 2001 From: Sophia Vorderwuelbecke Date: Tue, 4 Oct 2022 15:16:29 +0100 Subject: [PATCH 100/100] Fix merge leftovers for vectorisation in chapter 3 --- pyop2/compilation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 9761f0e97..4ee80b1b4 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -635,9 +635,9 @@ def __init__(self, code, argtypes): else: # Sniff compiler from executable if cpp: - exe = configuration["cxx"] or "g++" + exe = configuration["cxx"] or "mpicxx" else: - exe = configuration["cc"] or "gcc" + exe = configuration["cc"] or "mpicc" compiler = sniff_compiler(exe) dll = compiler(cppargs, ldargs, cpp=cpp, comm=comm).get_so(code, extension) if isinstance(jitmodule, GlobalKernel):