diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py
index 263b17b3c..057ea9386 100644
--- a/pyop2/codegen/rep2loopy.py
+++ b/pyop2/codegen/rep2loopy.py
@@ -203,13 +203,14 @@ def solve_fn_lookup(target, identifier):
 
 
 class _PreambleGen(ImmutableRecord):
-    fields = set(("preamble", ))
+    fields = {"preamble", "idx"}
 
-    def __init__(self, preamble):
+    def __init__(self, preamble, idx="0"):
         self.preamble = preamble
+        self.idx = idx
 
     def __call__(self, preamble_info):
-        yield ("0", self.preamble)
+        yield (self.idx, self.preamble)
 
 
 class PyOP2KernelCallable(loopy.ScalarCallable):
@@ -566,7 +567,9 @@ def generate(builder, wrapper_name=None):
                                 options=options,
                                 assumptions=assumptions,
                                 lang_version=(2018, 2),
-                                name=wrapper_name)
+                                name=wrapper_name,
+                                # TODO, should these really be silenced?
+                                silenced_warnings=["write_race*", "data_dep*"])
 
     # prioritize loops
     for indices in context.index_ordering:
diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index e5a9fefdd..71a339cde 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -218,7 +218,6 @@ def workaround_cflags(self):
             if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"):
                 # GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055
                 # See also https://github.com/firedrakeproject/firedrake/issues/1442
-                # And https://github.com/firedrakeproject/firedrake/issues/1717
                 # Bug also on skylake with the vectoriser in this
                 # combination (disappears without
                 # -fno-tree-loop-vectorize!)
@@ -368,7 +367,7 @@ class MacCompiler(Compiler):
     """
 
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-march=native', '-O3', '-ffast-math']
+        opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"
@@ -396,7 +395,7 @@ class LinuxCompiler(Compiler):
     :kwarg comm: Optional communicator to compile the code on (only
         rank 0 compiles code) (defaults to COMM_WORLD)."""
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-march=native', '-O3', '-ffast-math']
+        opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"
@@ -422,7 +421,7 @@ class LinuxIntelCompiler(Compiler):
         rank 0 compiles code) (defaults to COMM_WORLD).
     """
     def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
-        opt_flags = ['-Ofast', '-xHost']
+        opt_flags = ['-march=native', '-Ofast', '-xHost', '-qopenmp-simd']
         if configuration['debug']:
             opt_flags = ['-O0', '-g']
         cc = "mpicc"
diff --git a/pyop2/configuration.py b/pyop2/configuration.py
index fe5a2c4c5..ce666a64e 100644
--- a/pyop2/configuration.py
+++ b/pyop2/configuration.py
@@ -39,6 +39,33 @@
 from pyop2.exceptions import ConfigurationError
 
 
+def default_simd_width():
+    """Return the default SIMD width for the host CPU.
+
+    The width is the largest one associated with any ``avx*`` flag
+    reported by py-cpuinfo, so the result does not depend on the order
+    in which the flags are listed.  Flags that start with ``avx`` but
+    match no known extension (for instance ``avx_vnni``) are ignored.
+
+    :returns: The vector width, or 1 (which disables vectorisation) if
+        the CPU reports no recognised AVX extension.
+    """
+    from cpuinfo import get_cpu_info
+    avx_to_width = {'avx': 2, 'avx1': 2, 'avx128': 2, 'avx2': 4,
+                    'avx256': 4, 'avx3': 8, 'avx512': 8}
+    widths = []
+    for flag in get_cpu_info().get("flags", []):
+        if not flag.startswith('avx'):
+            continue
+        # Try the whole flag first, then its 6- and 4-character prefixes
+        # (e.g. 'avx512f' -> 'avx512', 'avx2xyz' -> 'avx2').
+        for key in (flag, flag[:6], flag[:4]):
+            if key in avx_to_width:
+                widths.append(avx_to_width[key])
+                break
+    return max(widths, default=1)
+
+
 class Configuration(dict):
     r"""PyOP2 configuration parameters
 
@@ -78,7 +105,10 @@ class Configuration(dict):
     # name, env variable, type, default, write once
     DEFAULTS = {
         "compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"),
-        "simd_width": ("PYOP2_SIMD_WIDTH", int, 4),
+        "simd_width": ("PYOP2_SIMD_WIDTH", int, default_simd_width()),
+        "vectorization_strategy": ("PYOP2_VECT_STRATEGY", str, "ve"),
+        "alignment": ("PYOP2_ALIGNMENT", int, 64),
+        "time": ("PYOP2_TIME", bool, False),
         "debug": ("PYOP2_DEBUG", bool, False),
         "cflags": ("PYOP2_CFLAGS", str, ""),
         "ldflags": ("PYOP2_LDFLAGS", str, ""),
diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 1dbab1c18..1408fd4f8 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -37,6 +37,7 @@
 from copy import deepcopy as dcopy
 
 import ctypes
+import loopy
 
 from pyop2.datatypes import IntType, as_ctypes
 from pyop2 import base
@@ -57,8 +58,35 @@
 from pyop2.mpi import collective
 from pyop2.profiling import timed_region
 from pyop2.utils import cached_property, get_petsc_dir
+from pyop2.configuration import configuration
 
-import loopy
+
+def vectorise(wrapper, iname, batch_size):
+    """Return a vectorised version of wrapper, vectorising over iname.
+
+    :arg wrapper: A loopy kernel to vectorise.
+    :arg iname: The iteration index to vectorise over.
+    :arg batch_size: The vector width."""
+    if batch_size == 1:
+        return wrapper
+
+    wrapper = wrapper.copy(target=loopy.CVecTarget())
+    kernel = wrapper.root_kernel
+
+    # split iname and vectorize the inner loop
+    slabs = (1, 1)
+    inner_iname = iname + "_batch"
+
+    if configuration["vectorization_strategy"] == "ve":
+        kernel = loopy.split_iname(kernel, iname, batch_size, slabs=slabs, inner_tag="vec", inner_iname=inner_iname)
+
+    alignment = configuration["alignment"]
+    tmps = {name: tv.copy(alignment=alignment) for name, tv in kernel.temporary_variables.items()}
+    kernel = kernel.copy(temporary_variables=tmps)
+
+    wrapper = wrapper.with_root_kernel(kernel)
+
+    return wrapper
 
 
 class JITModule(base.JITModule):
@@ -122,6 +150,20 @@ def code_to_compile(self):
             builder.add_argument(arg)
 
         wrapper = generate(builder)
+        if self._iterset._extruded:
+            iname = "layer"
+        else:
+            iname = "n"
+
+        has_matrix = any(arg._is_mat for arg in self._args)
+        has_rw = any(arg.access == RW for arg in self._args)
+        is_cplx = any(arg.dtype.name == 'complex128' for arg in self._args)
+        vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"])
+
+        if (isinstance(self._kernel.code, loopy.LoopKernel) and vectorisable):
+            wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name)
+            if not is_cplx:
+                wrapper = vectorise(wrapper, iname, configuration["simd_width"])
         code = loopy.generate_code_v2(wrapper)
 
         if self._kernel._cpp:
@@ -137,8 +179,6 @@ def compile(self):
         if not hasattr(self, '_args'):
             raise RuntimeError("JITModule has no args associated with it, should never happen")
 
-        from pyop2.configuration import configuration
-
         compiler = configuration["compiler"]
         extension = "cpp" if self._kernel._cpp else "c"
         cppargs = self._cppargs
@@ -184,6 +224,32 @@ def argtypes(self):
 
 class ParLoop(petsc_base.ParLoop):
 
+    def set_nbytes(self, args):
+        """Sum the data and map sizes of args and store the total in ``self.nbytes``.
+
+        Each argument contributes ``arg.data.nbytes``, counted twice for
+        ``INC`` access.  Each map contributes ``map_.values.nbytes`` once
+        per entry of ``map_._kernel_args_`` not already seen.
+
+        :arg args: The parloop arguments to account for.
+        """
+        nbytes = 0
+        seen = set()
+        for arg in args:
+            if arg.access is INC:
+                nbytes += arg.data.nbytes * 2
+            else:
+                nbytes += arg.data.nbytes
+            for map_ in arg.map_tuple:
+                if map_ is None:
+                    continue
+                for k in map_._kernel_args_:
+                    if k in seen:
+                        continue
+                    nbytes += map_.values.nbytes
+                    seen.add(k)
+        self.nbytes = nbytes
+
     def prepare_arglist(self, iterset, *args):
         arglist = iterset._kernel_args_
         for arg in args:
diff --git a/requirements-ext.txt b/requirements-ext.txt
index 758ccd963..a73f7da24 100644
--- a/requirements-ext.txt
+++ b/requirements-ext.txt
@@ -5,3 +5,4 @@ flake8>=2.1.0
 pycparser>=2.10
 mpi4py>=1.3.1
 decorator
+py-cpuinfo
diff --git a/requirements-git.txt b/requirements-git.txt
index 718e27330..4790f7f1b 100644
--- a/requirements-git.txt
+++ b/requirements-git.txt
@@ -1,4 +1,4 @@
 git+https://github.com/firedrakeproject/petsc.git@firedrake#egg=petsc
 --no-deps git+https://github.com/firedrakeproject/petsc4py.git@firedrake#egg=petsc4py
 git+https://github.com/coneoproject/COFFEE.git#egg=coffee
-git+https://github.com/firedrakeproject/loopy.git@firedrake#egg=loopy
+git+https://github.com/firedrakeproject/loopy.git@cvec#egg=loopy