From ef99f8d36e5a44e73d212420eab05d45f1a6a013 Mon Sep 17 00:00:00 2001
From: Ian Wood <ianwood2024@u.northwestern.edu>
Date: Tue, 19 Nov 2024 07:35:49 -0800
Subject: [PATCH] Be more aggressive when collapsing dimensions

Signed-off-by: Ian Wood <ianwood2024@u.northwestern.edu>
---
 .../DispatchCreation/CollapseDimensions.cpp   | 70 +++++++++++--------
 .../test/collapse_dimensions.mlir             | 38 ++++++++++
 2 files changed, 80 insertions(+), 28 deletions(-)
diff --git a/compiler/src/iree/compiler/DispatchCreation/CollapseDimensions.cpp b/compiler/src/iree/compiler/DispatchCreation/CollapseDimensions.cpp
index ba795789d8c56..389dc5ff9190a 100644
--- a/compiler/src/iree/compiler/DispatchCreation/CollapseDimensions.cpp
+++ b/compiler/src/iree/compiler/DispatchCreation/CollapseDimensions.cpp
@@ -12,6 +12,7 @@
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
 #include "iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h"
 #include "iree/compiler/DispatchCreation/Passes.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/Debug.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Affine/Utils.h"
@@ -121,8 +122,6 @@ static SmallVector<ReassociationIndices> getCollapsibleLoops(Operation *op) {
            (rDimsSet.count(prePos) && rDimsSet.count(nextPos));
   };
 
-  ReassociationIndices range;
-  AffineExpr preExpr;
   // Find the largest sequence of dimensions that are
   // - Either preserved in all maps, or
   // - are completely absent
@@ -134,23 +133,36 @@ static SmallVector<ReassociationIndices> getCollapsibleLoops(Operation *op) {
   //    found in all maps. If so, add to sequence (to get a sequence of 3)
   //    and repeat till the last element of sequence and the next result
   //    expression is not found as a sequence in all maps.
-  for (auto nextExpr :
-       fusionInterfaceOp.getIndexingMapsArray().front().getResults()) {
-    unsigned position = cast<AffineDimExpr>(nextExpr).getPosition();
-    if (!range.empty()) {
-      if (!hasAllMapsSameSequence(preExpr, nextExpr) ||
-          !hasSameIteratorType(preExpr, nextExpr)) {
+
+  llvm::DenseSet<unsigned> handledExprs;
+  for (auto map : fusionInterfaceOp.getIndexingMapsArray()) {
+    ReassociationIndices range;
+    AffineExpr preExpr;
+    for (auto nextExpr : map.getResults()) {
+      unsigned position = cast<AffineDimExpr>(nextExpr).getPosition();
+      if (handledExprs.contains(position)) {
+        preExpr = nullptr;
         if (range.size() > 1) {
           contiguousLoops.push_back({range.begin(), range.end()});
         }
         range.clear();
+        continue;
+      } else if (!range.empty()) {
+        if (!hasAllMapsSameSequence(preExpr, nextExpr) ||
+            !hasSameIteratorType(preExpr, nextExpr)) {
+          if (range.size() > 1) {
+            contiguousLoops.push_back({range.begin(), range.end()});
+          }
+          range.clear();
+        }
       }
+      range.push_back(position);
+      handledExprs.insert(position);
+      preExpr = nextExpr;
+    }
+    if (range.size() > 1) {
+      contiguousLoops.push_back(range);
     }
-    range.push_back(position);
-    preExpr = nextExpr;
-  }
-  if (range.size() > 1) {
-    contiguousLoops.push_back(range);
   }
 
   return contiguousLoops;
@@ -192,21 +204,21 @@ static bool isEligibleForCollapse(Operation *op) {
   }
 
   // TODO(#17948) GPU codegen fails when we collapse the dimensions of softmax.
-  if (llvm::any_of(genericOp.getDpsInputOperands(),
-                   [&](OpOperand *operand) -> bool {
-                     auto genericOperand =
-                         operand->get().getDefiningOp<linalg::GenericOp>();
-                     if (!genericOperand) {
-                       return false;
-                     }
-
-                     if (genericOperand.getNumReductionLoops() == 0) {
-                       return false;
-                     }
-
-                     return genericOp.getMatchingIndexingMap(operand)
-                         .isProjectedPermutation();
-                   })) {
+  if (llvm::any_of(
+          genericOp.getDpsInputOperands(), [&](OpOperand *operand) -> bool {
+            auto genericOperand =
+                operand->get().getDefiningOp<linalg::GenericOp>();
+            if (!genericOperand) {
+              return false;
+            }
+
+            if (genericOperand.getNumReductionLoops() == 0) {
+              return false;
+            }
+
+            auto map = genericOp.getMatchingIndexingMap(operand);
+            return !map.isPermutation() && map.isProjectedPermutation();
+          })) {
     return false;
   }
 
@@ -615,6 +627,7 @@ hoistTensorReshapesOutOfDispatchRegion(
   // 1. Get the slice of operations within `dispatchOp` that produce the yielded
   // value.
   BackwardSliceOptions sliceOptions;
+  sliceOptions.omitBlockArguments = true;
   sliceOptions.filter = [&](Operation *op) {
     return op->getParentOfType<IREE::Flow::DispatchRegionOp>();
   };
@@ -868,6 +881,7 @@ collapseDimensionsForDispatch(IRRewriter &rewriter,
   BackwardSliceOptions sliceOptions;
   sliceOptions.inclusive = true;
   sliceOptions.omitBlockArguments = true;
+  sliceOptions.omitUsesFromAbove = false;
   sliceOptions.filter = [&](Operation *op) -> bool {
     auto parentOp = op->getParentOfType<IREE::Flow::DispatchRegionOp>();
     return isEligibleForCollapse(op) && parentOp == regionOp;
diff --git a/compiler/src/iree/compiler/DispatchCreation/test/collapse_dimensions.mlir b/compiler/src/iree/compiler/DispatchCreation/test/collapse_dimensions.mlir
index ae4146fd2b649..00b3fe71b3a92 100644
--- a/compiler/src/iree/compiler/DispatchCreation/test/collapse_dimensions.mlir
+++ b/compiler/src/iree/compiler/DispatchCreation/test/collapse_dimensions.mlir
@@ -619,3 +619,41 @@ util.func public @collapse_attention_with_truncf(%arg0: tensor<20x4096x16xf32>,
 //       CHECK:   %[[TRUNC:.*]] = linalg.generic
 //  CHECK-SAME:      ins(%[[ATTN]] : tensor<20x4096x64xf32>
 //       CHECK:   flow.return %[[TRUNC]] : tensor<20x4096x64xf16>
+
+// -----
+
+util.func public @collapse(%10: tensor<2x32x32x1280xi8>, %11 : tensor<10240x1280xi8>, %12 : tensor<10240xi32>, %13 : tensor<10240xf32>) -> (tensor<2x32x32x10240xf16>) {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %14 = tensor.empty() : tensor<2x32x32x10240xf16>
+  %15 = tensor.empty() : tensor<2x32x32x10240xi32>
+  %16 = linalg.fill ins(%c0_i32 : i32) outs(%15 : tensor<2x32x32x10240xi32>) -> tensor<2x32x32x10240xi32>
+  %dispatch = flow.dispatch.region -> (tensor<2x32x32x10240xf16>) {
+    %17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d3, d4)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]} ins(%10, %11 : tensor<2x32x32x1280xi8>, tensor<10240x1280xi8>) outs(%16 : tensor<2x32x32x10240xi32>) {
+    ^bb0(%in: i8, %in_0: i8, %out: i32):
+      %19 = arith.extsi %in : i8 to i32
+      %20 = arith.extsi %in_0 : i8 to i32
+      %21 = arith.muli %19, %20 : i32
+      %22 = arith.addi %out, %21 : i32
+      linalg.yield %22 : i32
+    } -> tensor<2x32x32x10240xi32>
+    %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%17, %12, %13 : tensor<2x32x32x10240xi32>, tensor<10240xi32>, tensor<10240xf32>) outs(%14 : tensor<2x32x32x10240xf16>) {
+    ^bb0(%in: i32, %in_0: i32, %in_1: f32, %out: f16):
+      %19 = arith.addi %in, %in_0 : i32
+      %20 = arith.sitofp %19 : i32 to f32
+      %21 = arith.mulf %20, %in_1 : f32
+      %22 = arith.truncf %21 : f32 to f16
+      linalg.yield %22 : f16
+    } -> tensor<2x32x32x10240xf16>
+    flow.return %18 : tensor<2x32x32x10240xf16>
+  }
+  util.return %dispatch  : tensor<2x32x32x10240xf16>
+}
+
+// Expect the leading 2x32x32 parallel dims of both generics to collapse into one dim of size 2048.
+// CHECK-LABEL: util.func public @collapse
+//       CHECK:   %[[GEN0:.*]] = linalg.generic
+//  CHECK-SAME:      iterator_types = ["parallel", "parallel", "reduction"]
+//       CHECK:   %[[GEN1:.*]] = linalg.generic
+//  CHECK-SAME:      iterator_types = ["parallel", "parallel"]
+//       CHECK:   flow.return %[[GEN1]] : tensor<2048x10240xf16>