Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vectorisation #589

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions pyop2/codegen/rep2loopy.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,13 +203,14 @@ def solve_fn_lookup(target, identifier):


class _PreambleGen(ImmutableRecord):
    """Callable record that yields one fixed ``(idx, preamble)`` pair.

    :arg preamble: The preamble text to emit.
    :arg idx: The identifier yielded together with the preamble. Defaults
        to ``"0"``, which is the value that was previously hard-coded.
        NOTE(review): presumably consumed by loopy to key/order preambles
        -- confirm against the loopy preamble-generator protocol.
    """
    # Both attributes are record fields so they take part in the record's
    # copy/comparison machinery.
    fields = {"preamble", "idx"}

    def __init__(self, preamble, idx="0"):
        self.preamble = preamble
        self.idx = idx

    def __call__(self, preamble_info):
        # ``preamble_info`` is accepted for the caller's protocol but is not
        # consulted: the same pair is produced regardless of its contents.
        yield (self.idx, self.preamble)


class PyOP2KernelCallable(loopy.ScalarCallable):
Expand Down Expand Up @@ -566,7 +567,9 @@ def generate(builder, wrapper_name=None):
options=options,
assumptions=assumptions,
lang_version=(2018, 2),
name=wrapper_name)
name=wrapper_name,
# TODO, should these really be silenced?
silenced_warnings=["write_race*", "data_dep*"])

# prioritize loops
for indices in context.index_ordering:
Expand Down
7 changes: 3 additions & 4 deletions pyop2/compilation.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,6 @@ def workaround_cflags(self):
if version.StrictVersion("7.3") <= ver <= version.StrictVersion("7.5"):
# GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90055
# See also https://github.com/firedrakeproject/firedrake/issues/1442
# And https://github.com/firedrakeproject/firedrake/issues/1717
# The bug also appears on Skylake with the vectoriser in this
# combination (it goes away when -fno-tree-loop-vectorize is
# passed!)
Expand Down Expand Up @@ -368,7 +367,7 @@ class MacCompiler(Compiler):
"""

def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
opt_flags = ['-march=native', '-O3', '-ffast-math']
opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd']
if configuration['debug']:
opt_flags = ['-O0', '-g']
cc = "mpicc"
Expand Down Expand Up @@ -396,7 +395,7 @@ class LinuxCompiler(Compiler):
:kwarg comm: Optional communicator to compile the code on (only
rank 0 compiles code) (defaults to COMM_WORLD)."""
def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
opt_flags = ['-march=native', '-O3', '-ffast-math']
opt_flags = ['-march=native', '-O3', '-ffast-math', '-fopenmp-simd']
if configuration['debug']:
opt_flags = ['-O0', '-g']
cc = "mpicc"
Expand All @@ -422,7 +421,7 @@ class LinuxIntelCompiler(Compiler):
rank 0 compiles code) (defaults to COMM_WORLD).
"""
def __init__(self, cppargs=[], ldargs=[], cpp=False, comm=None):
opt_flags = ['-Ofast', '-xHost']
opt_flags = ['-march=native', '-Ofast', '-xHost', '-qopenmp-simd']
if configuration['debug']:
opt_flags = ['-O0', '-g']
cc = "mpicc"
Expand Down
21 changes: 20 additions & 1 deletion pyop2/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@
from pyop2.exceptions import ConfigurationError


def default_simd_width(flags=None):
    """Return the default SIMD width implied by a set of CPU feature flags.

    Every flag starting with ``avx`` is matched against the longest known
    AVX extension name it begins with, and the largest associated width is
    returned. Looking at all flags (rather than only the last one) makes
    the result independent of the order in which the flags are listed and
    tolerant of suffixed names such as ``avx512f`` or ``avx_vnni``.

    :arg flags: Optional iterable of CPU feature-flag strings. If ``None``
        (the default), the flags of the host CPU are queried through
        ``cpuinfo.get_cpu_info()``.
    :returns: The width for the widest AVX extension found, or ``1`` if no
        AVX flag is present at all (a width of one means "do not
        vectorise").
    """
    if flags is None:
        # Imported lazily so that callers supplying their own flags do not
        # need py-cpuinfo to be installed.
        from cpuinfo import get_cpu_info
        flags = get_cpu_info().get("flags", [])

    avx_to_width = {'avx': 2, 'avx1': 2, 'avx128': 2, 'avx2': 4,
                    'avx256': 4, 'avx3': 8, 'avx512': 8}
    # Try the longest names first so that e.g. "avx512f" is recognised as
    # "avx512" rather than as plain "avx".
    known_exts = sorted(avx_to_width, key=len, reverse=True)

    width = 1
    for flag in flags:
        if not flag.startswith('avx'):
            continue
        for ext in known_exts:
            if flag.startswith(ext):
                width = max(width, avx_to_width[ext])
                break
    return width


class Configuration(dict):
r"""PyOP2 configuration parameters

Expand Down Expand Up @@ -78,7 +94,10 @@ class Configuration(dict):
# name, env variable, type, default, write once
DEFAULTS = {
"compiler": ("PYOP2_BACKEND_COMPILER", str, "gcc"),
"simd_width": ("PYOP2_SIMD_WIDTH", int, 4),
"simd_width": ("PYOP2_SIMD_WIDTH", int, default_simd_width()),
"vectorization_strategy": ("PYOP2_VECT_STRATEGY", str, "ve"),
"alignment": ("PYOP2_ALIGNMENT", int, 64),
"time": ("PYOP2_TIME", bool, False),
"debug": ("PYOP2_DEBUG", bool, False),
"cflags": ("PYOP2_CFLAGS", str, ""),
"ldflags": ("PYOP2_LDFLAGS", str, ""),
Expand Down
64 changes: 61 additions & 3 deletions pyop2/sequential.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from copy import deepcopy as dcopy

import ctypes
import loopy

from pyop2.datatypes import IntType, as_ctypes
from pyop2 import base
Expand All @@ -57,8 +58,35 @@
from pyop2.mpi import collective
from pyop2.profiling import timed_region
from pyop2.utils import cached_property, get_petsc_dir
from pyop2.configuration import configuration

import loopy

def vectorise(wrapper, iname, batch_size):
    """Build a vectorised variant of a loopy wrapper kernel.

    :arg wrapper: The loopy kernel to vectorise.
    :arg iname: Name of the iteration index to vectorise over.
    :arg batch_size: The vector width. A width of ``1`` returns
        ``wrapper`` untouched.
    :returns: ``wrapper`` retargeted to ``loopy.CVecTarget`` with its root
        kernel's temporaries aligned to ``configuration["alignment"]``; if
        ``configuration["vectorization_strategy"]`` is ``"ve"``, ``iname``
        is additionally split by ``batch_size`` with the inner loop tagged
        ``"vec"``.
    """
    # Nothing to batch when the vector width is one.
    if batch_size == 1:
        return wrapper

    vec_wrapper = wrapper.copy(target=loopy.CVecTarget())
    knl = vec_wrapper.root_kernel

    # Name given to the inner (vectorised) loop produced by the split.
    batch_iname = iname + "_batch"

    if configuration["vectorization_strategy"] == "ve":
        knl = loopy.split_iname(
            knl, iname, batch_size,
            slabs=(1, 1),
            inner_tag="vec",
            inner_iname=batch_iname,
        )

    # Re-create every temporary with the configured alignment.
    align = configuration["alignment"]
    aligned_tmps = {
        name: tmp.copy(alignment=align)
        for name, tmp in knl.temporary_variables.items()
    }
    knl = knl.copy(temporary_variables=aligned_tmps)

    return vec_wrapper.with_root_kernel(knl)


class JITModule(base.JITModule):
Expand Down Expand Up @@ -122,6 +150,20 @@ def code_to_compile(self):
builder.add_argument(arg)

wrapper = generate(builder)
if self._iterset._extruded:
iname = "layer"
else:
iname = "n"

has_matrix = any(arg._is_mat for arg in self._args)
has_rw = any(arg.access == RW for arg in self._args)
is_cplx = any(arg.dtype.name == 'complex128' for arg in self._args)
vectorisable = not (has_matrix or has_rw) and (configuration["vectorization_strategy"])

if (isinstance(self._kernel.code, loopy.LoopKernel) and vectorisable):
wrapper = loopy.inline_callable_kernel(wrapper, self._kernel.name)
if not is_cplx:
wrapper = vectorise(wrapper, iname, configuration["simd_width"])
code = loopy.generate_code_v2(wrapper)

if self._kernel._cpp:
Expand All @@ -137,8 +179,6 @@ def compile(self):
if not hasattr(self, '_args'):
raise RuntimeError("JITModule has no args associated with it, should never happen")

from pyop2.configuration import configuration

compiler = configuration["compiler"]
extension = "cpp" if self._kernel._cpp else "c"
cppargs = self._cppargs
Expand Down Expand Up @@ -184,6 +224,24 @@ def argtypes(self):

class ParLoop(petsc_base.ParLoop):

def set_nbytes(self, args):
    """Accumulate a byte count over ``args`` and store it in ``self.nbytes``.

    For every argument the size of its data (``arg.data.nbytes``) is
    added; arguments accessed with ``INC`` are counted twice.
    NOTE(review): the factor of two presumably accounts for a read plus a
    write of incremented data -- confirm.

    For every non-``None`` map of an argument, ``map_.values.nbytes`` is
    added once per entry of ``map_._kernel_args_`` that has not been
    encountered before, so a map kernel argument shared between several
    arguments is only counted the first time it is seen.

    :arg args: Iterable of par-loop arguments exposing ``access``,
        ``data`` and ``map_tuple``.
    """
    nbytes = 0
    # Map kernel arguments already accounted for.
    seen = set()
    for arg in args:
        if arg.access is INC:
            nbytes += arg.data.nbytes * 2
        else:
            nbytes += arg.data.nbytes
        for map_ in arg.map_tuple:
            if map_ is None:
                continue
            for k in map_._kernel_args_:
                if k in seen:
                    continue
                nbytes += map_.values.nbytes
                seen.add(k)
    self.nbytes = nbytes

def prepare_arglist(self, iterset, *args):
arglist = iterset._kernel_args_
for arg in args:
Expand Down
1 change: 1 addition & 0 deletions requirements-ext.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ flake8>=2.1.0
pycparser>=2.10
mpi4py>=1.3.1
decorator
py-cpuinfo
2 changes: 1 addition & 1 deletion requirements-git.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
git+https://github.com/firedrakeproject/petsc.git@firedrake#egg=petsc
--no-deps git+https://github.com/firedrakeproject/petsc4py.git@firedrake#egg=petsc4py
git+https://github.com/coneoproject/COFFEE.git#egg=coffee
git+https://github.com/firedrakeproject/loopy.git@firedrake#egg=loopy
git+https://github.com/firedrakeproject/loopy.git@cvec#egg=loopy