[Im2col] Add option to unroll decomposed im2col loops (#18342)

This adds an `unroll` option to the DecomposeIm2colPass that unrolls the loop nest produced by the decomposition, and sets it to true by default. The unrolled form is easier to handle in most lowerings of the op. In particular, it makes it easier to generate good barrier placement in GPU lowerings: unrolling the loops results in better overall barrier placement with the current pipeline. --------- Signed-off-by: Max Dawkins <[email protected]>
iree-org · Sep 6, 2024 · edc5d5e · edc5d5e
1 parent b78def2
commit edc5d5e
Show file tree

Hide file tree

Showing 3 changed files with 116 additions and 17 deletions.
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/DecomposeIm2col.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/DecomposeIm2col.cpp
@@ -12,36 +12,58 @@
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 namespace mlir::iree_compiler::IREE::LinalgExt {
 
 #define GEN_PASS_DEF_DECOMPOSEIM2COLPASS
 #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h.inc"
 
-namespace {
-
-/// Pattern to decompose the tiled im2col op.
-struct DecomposeIm2col : public OpRewritePattern<Im2colOp> {
-  using OpRewritePattern<Im2colOp>::OpRewritePattern;
+static LogicalResult decomposeIm2col(Im2colOp im2colOp, RewriterBase &rewriter,  // Replaces `im2colOp` with its decomposition and, when `unroll` is set, unrolls the resulting scf.for nest.
+                                     bool unroll) {  // Returns failure if the decomposition or any loop unroll fails.
+  OpBuilder::InsertionGuard g(rewriter);  // Restores the rewriter's insertion point when this function returns.
+  rewriter.setInsertionPoint(im2colOp);
+  FailureOr<SmallVector<Value>> decomposedIm2col =
+      im2colOp.decomposeOperation(rewriter);
+  if (failed(decomposedIm2col)) {
+    return failure();
+  }
+  rewriter.replaceOp(im2colOp, decomposedIm2col.value().front());  // Only the first decomposed value is used as the replacement.
+  if (!unroll) {
+    return success();  // Leave the loop nest as produced by the decomposition.
+  }
 
-  LogicalResult matchAndRewrite(Im2colOp im2colOp,
-                                PatternRewriter &rewriter) const override {
-    FailureOr<SmallVector<Value>> decomposedIm2col =
-        im2colOp.decomposeOperation(rewriter);
-    if (failed(decomposedIm2col)) {
+  // Unroll the loop nest created by the im2col op decomposition, innermost loop first.
+  auto outerLoop = decomposedIm2col.value().front().getDefiningOp<scf::ForOp>();
+  assert(outerLoop &&
+         "expected im2col op decomposition to produce scf.for loop nest.");  // NOTE(review): this assert is the only null guard; with asserts disabled a null outerLoop would be dereferenced in the walk below.
+  SmallVector<scf::ForOp> loopNest({outerLoop});
+  while (auto innerLoop =
+             outerLoop.getYieldedValues()[0].getDefiningOp<scf::ForOp>()) {  // Follow the first yielded value down to the next nested loop, if any.
+    loopNest.push_back(innerLoop);  // loopNest ends up ordered outermost to innermost.
+    outerLoop = innerLoop;
+  }
+  for (auto loop : llvm::reverse(loopNest)) {  // Visit the innermost loop first.
+    std::optional<int64_t> ub = getConstantIntValue(loop.getUpperBound());
+    if (!ub.has_value() || ub.value() == 1) {  // Skip loops whose upper bound is not a constant, or is exactly 1.
+      continue;
+    }
+    rewriter.setInsertionPoint(loop);  // NOTE(review): the rewriter is not passed to loopUnrollByFactor below; confirm this insertion point is still needed.
+    if (failed(mlir::loopUnrollByFactor(loop, ub.value()))) {  // NOTE(review): the upper bound is used as the unroll factor, which equals the trip count only if lb = 0 and step = 1; confirm the decomposition guarantees that.
+      loop.emitOpError("failed to unroll loop");
       return failure();
     }
-    rewriter.replaceOp(im2colOp, decomposedIm2col.value().front());
-    return success();
   }
-};
-
-} // namespace
+  return success();
+}
 
 namespace {
 struct DecomposeIm2colPass final
     : impl::DecomposeIm2colPassBase<DecomposeIm2colPass> {
+  using impl::DecomposeIm2colPassBase<
+      DecomposeIm2colPass>::DecomposeIm2colPassBase;
+
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<
         affine::AffineDialect, IREE::LinalgExt::IREELinalgExtDialect,
@@ -54,8 +76,18 @@ struct DecomposeIm2colPass final
 
 void DecomposeIm2colPass::runOnOperation() {
   MLIRContext *context = &getContext();
+  auto funcOp = getOperation();
+
+  SmallVector<Im2colOp> candidates;
+  funcOp->walk([&](Im2colOp op) { candidates.push_back(op); });
+  IRRewriter rewriter(context);
+  for (auto im2colOp : candidates) {
+    if (failed(decomposeIm2col(im2colOp, rewriter, unroll))) {
+      return signalPassFailure();
+    }
+  }
+
   RewritePatternSet patterns(context);
-  patterns.add<DecomposeIm2col>(context);
   memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
   if (failed(
           applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) {

diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Passes.td b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/Passes.td
@@ -54,6 +54,10 @@ def DecomposeIm2colPass :
     InterfacePass<"iree-linalg-ext-decompose-im2col", "mlir::FunctionOpInterface"> {
   let summary =
       "Decomposes im2col ops into insert and extract slice ops";
+  let options = [
+    Option<"unroll", "unroll", "bool", /*default=*/"true",
+           "Unroll the resulting loop nest after decomposition.">,
+  ];
 }
 
 def DecomposeWinogradTransformPass :

diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_im2col.mlir b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/decompose_im2col.mlir
@@ -1,4 +1,5 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-linalg-ext-decompose-im2col))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-linalg-ext-decompose-im2col{unroll=false}))" --split-input-file %s | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-linalg-ext-decompose-im2col{unroll=true}))" --split-input-file %s | FileCheck %s --check-prefix=CHECK-UNROLL
 
 #map = affine_map<(d0) -> (d0 * 4)>
 module {
@@ -71,3 +72,65 @@ module {
 //      CHECK:     scf.yield %[[mLOOP]] : tensor<2x?x?xf32>
 //      CHECK:   }
 //      CHECK:   return %[[bLOOP]] : tensor<2x?x?xf32>
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 4)>
+module {
+  func.func @im2col_unrolled(%arg0: tensor<2x34x34x640xf32>, %m_off: index, %k: index) -> tensor<2x2x4xf32> {  // Fully static 2x2x4 output tile; the unrolled checks below expect four copy iterations and no remaining loop.
+    %0 = tensor.empty() : tensor<2x2x4xf32>
+    %k_off = affine.apply #map(%k)  // k offset is %k * 4, per #map above.
+    %7 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%m_off] k_offset = [%k_off] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%arg0 : tensor<2x34x34x640xf32>) outs(%0 : tensor<2x2x4xf32>) -> tensor<2x2x4xf32>
+    return %7 : tensor<2x2x4xf32>
+  }
+}
+//  CHECK-UNROLL-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 160) * 640)>
+//  CHECK-UNROLL-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) floordiv 32 + s1 floordiv 480)>
+//  CHECK-UNROLL-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) mod 32 + s1 floordiv 160 - (s1 floordiv 480) * 3)>
+//      CHECK-UNROLL: func.func @im2col_unrolled(%[[ARG0:.+]]: tensor<2x34x34x640xf32>
+// CHECK-UNROLL-SAME:   %[[mOFF:.+]]: index, %[[K:.+]]: index)
+//  CHECK-UNROLL-DAG:   %[[C0:.+]] = arith.constant 0 : index
+//  CHECK-UNROLL-DAG:   %[[C1:.+]] = arith.constant 1 : index
+//      CHECK-UNROLL:   %[[OUT_TILE:.+]] = tensor.empty() : tensor<2x2x4xf32>
+
+//  First iteration (batch index 0, m index 0)
+//
+//  CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
+//  CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C0]])[%[[mOFF]], %[[K]]]
+//  CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C0]])[%[[mOFF]], %[[K]]]
+//      CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[OUT_TILE]][%[[C0]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK-UNROLL:   %[[INSERT0:.+]] = tensor.insert_slice %[[COPY]] into %[[OUT_TILE]][%[[C0]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>
+
+//  Second iteration (batch index 0, m index 1)
+//
+//  CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
+//  CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C1]])[%[[mOFF]], %[[K]]]
+//  CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C1]])[%[[mOFF]], %[[K]]]
+//      CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT0]][%[[C0]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK-UNROLL:   %[[INSERT1:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT0]][%[[C0]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>
+
+//  Third iteration (batch index 1, m index 0)
+//
+//  CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
+//  CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C0]])[%[[mOFF]], %[[K]]]
+//  CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C0]])[%[[mOFF]], %[[K]]]
+//      CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT1]][%[[C1]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK-UNROLL:   %[[INSERT2:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT1]][%[[C1]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>
+
+//  Fourth iteration (batch index 1, m index 1)
+//
+//  CHECK-UNROLL-DAG:   %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
+//  CHECK-UNROLL-DAG:   %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C1]])[%[[mOFF]], %[[K]]]
+//  CHECK-UNROLL-DAG:   %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C1]])[%[[mOFF]], %[[K]]]
+//      CHECK-UNROLL:   %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT2]][%[[C1]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
+//      CHECK-UNROLL:   %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
+//      CHECK-UNROLL:   %[[INSERT3:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT2]][%[[C1]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>
+
+//      CHECK-UNROLL:   return %[[INSERT3]] : tensor<2x2x4xf32>