
Commit

Merge commit '6958807390a43dca6f68cf9039aed9f4c72c700d'
whitneywhtsang committed Nov 17, 2024
2 parents 7d4f1ce + 6958807 commit 8382e76
Showing 27 changed files with 764 additions and 185 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/llvm-build.yml
@@ -28,6 +28,7 @@ jobs:
config:
- {runner: 'Ubuntu 20.04', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'x64'}
- {runner: 'Ubuntu 20.04 ARM64', runs_on: 'ubuntu-20.04', target-os: 'ubuntu', arch: 'arm64'}
- {runner: 'CentOS 7', runs_on: ['self-hosted', 'CPU'], target-os: 'centos', arch: 'x64'}
- {runner: 'AlmaLinux 8', runs_on: ['self-hosted', 'CPU'], target-os: 'almalinux', arch: 'x64'}
- {runner: 'MacOS X64', runs_on: 'macos-12', target-os: 'macos', arch: 'x64'}
- {runner: 'MacOS ARM64', runs_on: 'macos-12', target-os: 'macos', arch: 'arm64'}
@@ -233,6 +234,30 @@ jobs:
tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
- name: Configure, Build, Test, and Install LLVM (CentOS)
if: matrix.config.target-os == 'centos'
run: |
# if this step crashes, it can leave behind a stale docker container
docker container prune -f
docker rmi -f $(docker images -q)
docker build --tag llvm-build --build-arg llvm_dir=llvm-project \
-f llvm-build/.github/workflows/llvm-build/centos.Dockerfile .
# Create temporary container to copy cache and installed artifacts.
CONTAINER_ID=$(docker create llvm-build)
docker cp "${CONTAINER_ID}:/install" "${{ env.llvm_install_dir }}"
tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
# We remove the existing directory, otherwise docker will
# create a subdirectory inside the existing directory.
rm -rf "${{ env.SCCACHE_DIR }}"
docker cp "${CONTAINER_ID}:/sccache" "${{ env.SCCACHE_DIR }}"
sudo chown -R "$(id -u -n):$(id -g -n)" "${{ env.SCCACHE_DIR }}"
docker rm "${CONTAINER_ID}"
- name: Configure, Build, Test, and Install LLVM (AlmaLinux)
if: matrix.config.target-os == 'almalinux'
run: |
56 changes: 56 additions & 0 deletions .github/workflows/llvm-build/centos.Dockerfile
@@ -0,0 +1,56 @@
FROM centos:7
ARG llvm_dir=llvm-project
# Add the cache artifacts and the LLVM source tree to the container
ADD sccache /sccache
ADD "${llvm_dir}" /source/llvm-project
ENV SCCACHE_DIR="/sccache"
ENV SCCACHE_CACHE_SIZE="2G"

RUN echo -e "[llvmtoolset-build]\nname=LLVM Toolset 13.0 - Build\nbaseurl=https://buildlogs.centos.org/c7-llvm-toolset-13.0.x86_64/\ngpgcheck=0\nenabled=1" > /etc/yum.repos.d/llvmtoolset-build.repo

# Note: this patch is required because CentOS 7 has reached EOL;
# without it, any yum install step will fail
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo

# Install build dependencies
RUN yum install --assumeyes centos-release-scl

# centos-release-scl adds new repo files that again point at mirror.centos.org, so the vault substitutions must be repeated
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo

RUN yum install --assumeyes --nogpgcheck llvm-toolset-13.0
RUN yum install --assumeyes rh-python38-python-devel rh-python38-python-pip
SHELL [ "/usr/bin/scl", "enable", "llvm-toolset-13.0", "rh-python38" ]

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --upgrade cmake ninja sccache

# Install MLIR's Python Dependencies
RUN python3 -m pip install -r /source/llvm-project/mlir/python/requirements.txt

# Configure, Build, Test, and Install LLVM
RUN cmake -GNinja -Bbuild \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_C_COMPILER=clang \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_ASM_COMPILER=clang \
-DCMAKE_C_COMPILER_LAUNCHER=sccache \
-DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
-DCMAKE_CXX_FLAGS="-Wno-everything" \
-DCMAKE_LINKER=lld \
-DCMAKE_INSTALL_PREFIX="/install" \
-DLLVM_BUILD_UTILS=ON \
-DLLVM_BUILD_TOOLS=ON \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DMLIR_ENABLE_BINDINGS_PYTHON=ON \
-DLLVM_ENABLE_PROJECTS=mlir \
-DLLVM_ENABLE_TERMINFO=OFF \
-DLLVM_INSTALL_UTILS=ON \
-DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
/source/llvm-project/llvm

RUN ninja -C build install
1 change: 1 addition & 0 deletions include/triton/Dialect/Triton/IR/TritonOps.td
@@ -211,6 +211,7 @@ def TT_AddPtrOp : TT_Op<"addptr",
let results = (outs TT_PtrLike:$result);

let assemblyFormat = "$ptr `,` $offset attr-dict `:` type($result) `,` type($offset)";
let hasFolder = 1;
}

def TT_AdvanceOp : TT_Op<"advance",
8 changes: 7 additions & 1 deletion include/triton/Dialect/TritonGPU/Transforms/Utility.h
@@ -195,7 +195,13 @@ int getNVIDIAComputeCapability(Operation *module);
std::optional<mlir::triton::gpu::SharedEncodingAttr>
getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);

bool loadIsMMAv3(Operation *loadOp);
enum class MMALoadType {
SharedV3,
Registers, // may be v2 or v3
DoNotPipeline, // could be a valid shared/registers MMA operand, but skip
// pipelining
};
MMALoadType getMMALoadType(Operation *loadOp);
} // namespace mlir

#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
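The new enum replaces the boolean loadIsMMAv3 predicate with a three-way classification: loads whose result must live in shared memory as an MMAv3 (WGMMA) operand, loads that feed the MMA through registers (MMAv2 or MMAv3), and loads that should not be pipelined at all. Below is a minimal caller-side sketch of how a pass might branch on the result; the helper function itself is an illustrative assumption, not code from this change (the real call site is in LoopScheduling.cpp further down).

// Hypothetical dispatch on MMALoadType; only the enum and getMMALoadType come
// from the header above, the helper itself is illustrative.
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
#include "llvm/Support/ErrorHandling.h"

using namespace mlir;

static bool loadNeedsSharedEncoding(Operation *loadOp) {
  switch (getMMALoadType(loadOp)) {
  case MMALoadType::SharedV3:
    return true;  // MMAv3 reads this operand directly from shared memory.
  case MMALoadType::Registers:
    return false; // Operand is passed through registers (MMAv2 or MMAv3).
  case MMALoadType::DoNotPipeline:
    return false; // Possibly a valid MMA operand, but skip pipelining.
  }
  llvm_unreachable("unhandled MMALoadType");
}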
9 changes: 9 additions & 0 deletions lib/Dialect/Triton/IR/Ops.cpp
@@ -848,6 +848,15 @@ void MakeTensorPtrOp::build(OpBuilder &builder, OperationState &state,
builder.getDenseI32ArrayAttr(order));
}

//-- AddPtrOp --
OpFoldResult AddPtrOp::fold(FoldAdaptor adaptor) {
// addptr(ptr, 0) -> ptr
if (matchPattern(adaptor.getOffset(), m_Zero())) {
return getPtr();
}
return {};
}

//-- AdvanceOp --
OpFoldResult AdvanceOp::fold(FoldAdaptor adaptor) {
// advance(ptr, 0, 0) -> ptr
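Together with "let hasFolder = 1" in TritonOps.td above, this gives tt.addptr a standard MLIR fold hook, so the canonicalizer and OpBuilder::createOrFold can drop pointer additions with a provably zero offset. A minimal sketch of the builder-side effect follows; the AddPtrOp builder arguments used here are an assumption for illustration, not taken from this change.

// Illustrative only: with the new folder, building an addptr whose offset is a
// known zero yields the original pointer instead of creating a tt.addptr op.
// The AddPtrOp builder signature used here is an assumption for this sketch.
#include "mlir/IR/Builders.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

using namespace mlir;

static Value addOffsetOrFold(OpBuilder &b, Location loc, Value ptr,
                             Value offset) {
  // createOrFold invokes AddPtrOp::fold; when the offset matches m_Zero(),
  // the folder returns ptr and no new operation is inserted.
  return b.createOrFold<triton::AddPtrOp>(loc, ptr.getType(), ptr, offset);
}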
32 changes: 1 addition & 31 deletions lib/Dialect/Triton/Transforms/Combine.cpp
@@ -7,7 +7,6 @@
#include "mlir/Support/LLVM.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "triton/Analysis/Utility.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/Triton/Transforms/Passes.h"

@@ -18,35 +17,7 @@ namespace mlir::triton {
namespace {

bool isZero(Value val) {
if (matchPattern(val, m_Zero()) || matchPattern(val, m_AnyZeroFloat()))
return true;
// broadcast(constant_0)
if (auto bc = val.getDefiningOp<BroadcastOp>()) {
if (matchPattern(bc.getSrc(), m_Zero()) ||
matchPattern(bc.getSrc(), m_AnyZeroFloat()))
return true;
}
return false;
}

bool isBroadcastConstantCombinable(Attribute value) {
if (auto denseValue = dyn_cast<DenseElementsAttr>(value)) {
return denseValue.isSplat();
}
return isa<FloatAttr, IntegerAttr>(value);
}

DenseElementsAttr getConstantValue(Builder &builder, Attribute value,
Value bcast_res) {
auto resType = cast<ShapedType>(bcast_res.getType());
DenseElementsAttr res;
if (auto denseValue = dyn_cast<DenseElementsAttr>(value)) {
res =
DenseElementsAttr::get(resType, denseValue.getSplatValue<Attribute>());
} else {
res = DenseElementsAttr::get(resType, value);
}
return res;
return (matchPattern(val, m_Zero()) || matchPattern(val, m_AnyZeroFloat()));
}

bool isAddPtrOffsetCombinable(Value first, Value second) {
@@ -231,7 +202,6 @@ class CombineOpsPass : public TritonCombineOpsBase<CombineOpsPass> {
// %}
patterns.add<CombineSelectMaskedLoadPattern>(context);
patterns.add<CombineAddPtrPattern>(context);
patterns.add<CombineBroadcastConstantPattern>(context);
patterns.add<CombineBroadcastMulReducePattern>(context);

if (applyPatternsAndFoldGreedily(m, std::move(patterns)).failed())
7 changes: 0 additions & 7 deletions lib/Dialect/Triton/Transforms/Combine.td
@@ -44,11 +44,4 @@ def CombineAddPtrPattern : Pat<
(TT_AddPtrOp $ptr, (Arith_AddIOp $idx0, $idx1, DefOverflow)),
[(Constraint<CPred<"isAddPtrOffsetCombinable($0, $1)">> $idx0, $idx1)]>;

// broadcast(cst) => cst
def getConstantValue : NativeCodeCall<"getConstantValue($_builder, $0, $1)">;
def CombineBroadcastConstantPattern : Pat<
(TT_BroadcastOp:$bcast_res (Arith_ConstantOp $value)),
(Arith_ConstantOp (getConstantValue $value, $bcast_res), (location $bcast_res)),
[(Constraint<CPred<"isBroadcastConstantCombinable($0)">> $value)]>;

#endif
16 changes: 2 additions & 14 deletions lib/Dialect/Triton/Transforms/ReorderBroadcast.cpp
@@ -206,18 +206,6 @@ struct MoveBroadcastAfterElementwisePattern
}
};

template <typename OpType>
class CanonicalizePattern : public OpRewritePattern<OpType> {
public:
explicit CanonicalizePattern(MLIRContext *context)
: OpRewritePattern<OpType>(context) {}

LogicalResult matchAndRewrite(OpType op,
PatternRewriter &rewriter) const override {
return OpType::canonicalize(op, rewriter);
}
};

class ReorderBroadcastPass
: public ::impl::TritonReorderBroadcastBase<ReorderBroadcastPass> {
public:
@@ -226,8 +214,8 @@ class ReorderBroadcastPass
RewritePatternSet patterns(context);
ModuleOp m = getOperation();

patterns.add<CanonicalizePattern<BroadcastOp>>(context);
patterns.add<CanonicalizePattern<ExpandDimsOp>>(context);
BroadcastOp::getCanonicalizationPatterns(patterns, context);
ExpandDimsOp::getCanonicalizationPatterns(patterns, context);
// elementwise(broadcast(a)) => broadcast(elementwise(a))
patterns.add<MoveBroadcastAfterElementwisePattern>(context);
// elementwise(splat(a), splat(b), ...) => splat(elementwise(a, b, ...))
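Rather than wrapping each op's static canonicalize method in a local CanonicalizePattern adapter, the pass now collects the ops' registered canonicalization patterns through the standard getCanonicalizationPatterns hook. A condensed sketch of the resulting pattern-population idiom, mirroring the updated pass body (pass registration and error handling omitted):

// Sketch of the getCanonicalizationPatterns idiom used by the updated pass;
// simplified from the pass body shown above.
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "triton/Dialect/Triton/IR/Dialect.h"

using namespace mlir;
using namespace mlir::triton;

static LogicalResult applyReorderBroadcastPatterns(ModuleOp m) {
  MLIRContext *context = m.getContext();
  RewritePatternSet patterns(context);
  // Ops with declared canonicalizers expose this static hook, which appends
  // their canonicalization patterns to the set.
  BroadcastOp::getCanonicalizationPatterns(patterns, context);
  ExpandDimsOp::getCanonicalizationPatterns(patterns, context);
  return applyPatternsAndFoldGreedily(m, std::move(patterns));
}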
11 changes: 9 additions & 2 deletions lib/Dialect/TritonGPU/Transforms/LoopScheduling.cpp
@@ -145,11 +145,18 @@ filterPipelinedLoad(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>

bool hasSharedEncoding = false;
if (use->hasTrait<OpTrait::DotLike>()) {
if (loadIsMMAv3(op)) {
auto mmaLoadType = getMMALoadType(op);
auto dot = dyn_cast<tt::DotOp>(use);
auto warpGroupDot = dyn_cast<ttng::WarpGroupDotOp>(use);
bool isMMAv3Shared = mmaLoadType == MMALoadType::SharedV3;
bool isMMAv3Registers =
(mmaLoadType == MMALoadType::Registers) && warpGroupDot;

if (isMMAv3Shared) {
hasSharedEncoding = true;
} else if (isa<tt::ExperimentalDescriptorLoadOp>(op)) {
hasSharedEncoding = true;
} else if (auto dot = dyn_cast<tt::DotOp>(use)) {
} else if (isMMAv3Registers || dot) {
// FIXME: if we have a better solution in handling incompatible shared
// encoding, we can simplify the logic here by checking if all users are
// dot encoding. For now, getSharedEncIfAllUsersAreDotEnc will be used