Skip to content

Commit

Permalink
Be more aggressive with collapse
Browse files Browse the repository at this point in the history
Signed-off-by: Ian Wood <[email protected]>
  • Loading branch information
IanWood1 committed Nov 19, 2024
1 parent ef1381b commit ef99f8d
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 28 deletions.
70 changes: 42 additions & 28 deletions compiler/src/iree/compiler/DispatchCreation/CollapseDimensions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h"
#include "iree/compiler/DispatchCreation/Passes.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/Debug.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/Utils.h"
Expand Down Expand Up @@ -121,8 +122,6 @@ static SmallVector<ReassociationIndices> getCollapsibleLoops(Operation *op) {
(rDimsSet.count(prePos) && rDimsSet.count(nextPos));
};

ReassociationIndices range;
AffineExpr preExpr;
// Find the largest sequence of dimensions that are
// - Either preserved in all maps, or
// - are completely absent
Expand All @@ -134,23 +133,36 @@ static SmallVector<ReassociationIndices> getCollapsibleLoops(Operation *op) {
// found in all maps. If so, add to sequence (to get a sequence of 3)
// and repeat till the last element of sequence and the next result
// expression is not found as a sequence in all maps.
for (auto nextExpr :
fusionInterfaceOp.getIndexingMapsArray().front().getResults()) {
unsigned position = cast<AffineDimExpr>(nextExpr).getPosition();
if (!range.empty()) {
if (!hasAllMapsSameSequence(preExpr, nextExpr) ||
!hasSameIteratorType(preExpr, nextExpr)) {

llvm::DenseSet<unsigned> handledExprs;
for (auto map : fusionInterfaceOp.getIndexingMapsArray()) {
ReassociationIndices range;
AffineExpr preExpr;
for (auto nextExpr : map.getResults()) {
unsigned position = cast<AffineDimExpr>(nextExpr).getPosition();
if (handledExprs.contains(position)) {
preExpr = nullptr;
if (range.size() > 1) {
contiguousLoops.push_back({range.begin(), range.end()});
}
range.clear();
continue;
} else if (!range.empty()) {
if (!hasAllMapsSameSequence(preExpr, nextExpr) ||
!hasSameIteratorType(preExpr, nextExpr)) {
if (range.size() > 1) {
contiguousLoops.push_back({range.begin(), range.end()});
}
range.clear();
}
}
range.push_back(position);
handledExprs.insert(position);
preExpr = nextExpr;
}
if (range.size() > 1) {
contiguousLoops.push_back(range);
}
range.push_back(position);
preExpr = nextExpr;
}
if (range.size() > 1) {
contiguousLoops.push_back(range);
}

return contiguousLoops;
Expand Down Expand Up @@ -192,21 +204,21 @@ static bool isEligibleForCollapse(Operation *op) {
}

// TODO(#17948) GPU codegen fails when we collapse the dimensions of softmax.
if (llvm::any_of(genericOp.getDpsInputOperands(),
[&](OpOperand *operand) -> bool {
auto genericOperand =
operand->get().getDefiningOp<linalg::GenericOp>();
if (!genericOperand) {
return false;
}

if (genericOperand.getNumReductionLoops() == 0) {
return false;
}

return genericOp.getMatchingIndexingMap(operand)
.isProjectedPermutation();
})) {
if (llvm::any_of(
genericOp.getDpsInputOperands(), [&](OpOperand *operand) -> bool {
auto genericOperand =
operand->get().getDefiningOp<linalg::GenericOp>();
if (!genericOperand) {
return false;
}

if (genericOperand.getNumReductionLoops() == 0) {
return false;
}

auto map = genericOp.getMatchingIndexingMap(operand);
return !map.isPermutation() && map.isProjectedPermutation();
})) {
return false;
}

Expand Down Expand Up @@ -615,6 +627,7 @@ hoistTensorReshapesOutOfDispatchRegion(
// 1. Get the slice of operations within `dispatchOp` that produce the yielded
// value.
BackwardSliceOptions sliceOptions;
sliceOptions.omitBlockArguments = true;
sliceOptions.filter = [&](Operation *op) {
return op->getParentOfType<IREE::Flow::DispatchRegionOp>();
};
Expand Down Expand Up @@ -868,6 +881,7 @@ collapseDimensionsForDispatch(IRRewriter &rewriter,
BackwardSliceOptions sliceOptions;
sliceOptions.inclusive = true;
sliceOptions.omitBlockArguments = true;
sliceOptions.omitUsesFromAbove = false;
sliceOptions.filter = [&](Operation *op) -> bool {
auto parentOp = op->getParentOfType<IREE::Flow::DispatchRegionOp>();
return isEligibleForCollapse(op) && parentOp == regionOp;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -619,3 +619,41 @@ util.func public @collapse_attention_with_truncf(%arg0: tensor<20x4096x16xf32>,
// CHECK: %[[TRUNC:.*]] = linalg.generic
// CHECK-SAME: ins(%[[ATTN]] : tensor<20x4096x64xf32>
// CHECK: flow.return %[[TRUNC]] : tensor<20x4096x64xf16>

// -----

// Exercises dimension collapsing on a dispatch region that holds two
// linalg.generic ops: an i8 x i8 -> i32 contraction (reduction over d4)
// followed by an elementwise op that consumes the contraction's result.
// In every indexing map below, d0, d1 and d2 either appear together in that
// order or are all absent. The FileCheck expectations that follow this
// function expect two parallel loops per op (plus one reduction loop for the
// first op) and a 2048x10240 result, where 2048 = 2 * 32 * 32.
util.func public @collapse(%10: tensor<2x32x32x1280xi8>, %11 : tensor<10240x1280xi8>, %12 : tensor<10240xi32>, %13 : tensor<10240xf32>) -> (tensor<2x32x32x10240xf16>) {
%c0_i32 = arith.constant 0 : i32
// NOTE(review): %c0 is not referenced anywhere else in this function.
%c0 = arith.constant 0 : index
// %14 is the init tensor for the f16 result; %15 is zero-filled into %16 and
// used as the i32 accumulator of the first generic.
%14 = tensor.empty() : tensor<2x32x32x10240xf16>
%15 = tensor.empty() : tensor<2x32x32x10240xi32>
%16 = linalg.fill ins(%c0_i32 : i32) outs(%15 : tensor<2x32x32x10240xi32>) -> tensor<2x32x32x10240xi32>
%dispatch = flow.dispatch.region -> (tensor<2x32x32x10240xf16>) {
// Contraction: %10 is indexed by (d0, d1, d2, d4), %11 by (d3, d4), and the
// output by (d0, d1, d2, d3); d4 is the only reduction iterator.
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%10, %11 : tensor<2x32x32x1280xi8>, tensor<10240x1280xi8>) outs(%16 : tensor<2x32x32x10240xi32>) {
^bb0(%in: i8, %in_0: i8, %out: i32):
// Sign-extend both i8 inputs to i32, multiply, and accumulate into %out.
%19 = arith.extsi %in : i8 to i32
%20 = arith.extsi %in_0 : i8 to i32
%21 = arith.muli %19, %20 : i32
%22 = arith.addi %out, %21 : i32
linalg.yield %22 : i32
} -> tensor<2x32x32x10240xi32>
// Elementwise consumer: %17 and the output use the identity map, while %12
// and %13 are indexed by d3 only (one value per position along d3).
%18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17, %12, %13 : tensor<2x32x32x10240xi32>, tensor<10240xi32>, tensor<10240xf32>) outs(%14 : tensor<2x32x32x10240xf16>) {
^bb0(%in: i32, %in_0: i32, %in_1: f32, %out: f16):
// Add the i32 value from %12, convert to f32, multiply by the f32 value
// from %13, then truncate to f16.
%19 = arith.addi %in, %in_0 : i32
%20 = arith.sitofp %19 : i32 to f32
%21 = arith.mulf %20, %in_1 : f32
%22 = arith.truncf %21 : f32 to f16
linalg.yield %22 : f16
} -> tensor<2x32x32x10240xf16>
flow.return %18 : tensor<2x32x32x10240xf16>
}
util.return %dispatch : tensor<2x32x32x10240xf16>
}


// CHECK-LABEL: util.func public @collapse
// CHECK: %[[GEN0:.*]] = linalg.generic
// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
// CHECK: %[[GEN1:.*]] = linalg.generic
// CHECK-SAME: iterator_types = ["parallel", "parallel"]
// CHECK: flow.return %[[GEN1]] : tensor<2048x10240xf16>

0 comments on commit ef99f8d

Please sign in to comment.