Skip to content

Commit

Permalink
[Im2col] Add option to unroll decomposed im2col loops (#18342)
Browse files Browse the repository at this point in the history
This adds an option to the DecomposeIm2colPass to unroll the loop nest
produced by the decomposition, and enables it by default. The unrolled
form is easier to handle in most lowerings of the op. In particular, with
the current pipeline, unrolling the loops results in better overall
barrier placement in GPU lowerings.

---------

Signed-off-by: Max Dawkins <[email protected]>
  • Loading branch information
Max191 authored Sep 6, 2024
1 parent b78def2 commit edc5d5e
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,36 +12,58 @@
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace mlir::iree_compiler::IREE::LinalgExt {

#define GEN_PASS_DEF_DECOMPOSEIM2COLPASS
#include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h.inc"

namespace {

/// Pattern to decompose the tiled im2col op.
struct DecomposeIm2col : public OpRewritePattern<Im2colOp> {
using OpRewritePattern<Im2colOp>::OpRewritePattern;
/// Decomposes `im2colOp` and, when `unroll` is true, unrolls the loop nest
/// that the decomposition produced.
///
/// The op is replaced with the first value returned by
/// `Im2colOp::decomposeOperation`. Returns failure if the decomposition fails
/// or if `mlir::loopUnrollByFactor` fails on any loop of the nest; in the
/// latter case an op error is emitted on the offending loop.
///
/// NOTE(review): this listing is a flattened diff view. Some lines below
/// appear to belong to the removed `DecomposeIm2col::matchAndRewrite` rather
/// than to this function — confirm against the actual file.
static LogicalResult decomposeIm2col(Im2colOp im2colOp, RewriterBase &rewriter,
bool unroll) {
// Scoped guard for the rewriter's insertion point, which is changed below.
OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPoint(im2colOp);
FailureOr<SmallVector<Value>> decomposedIm2col =
im2colOp.decomposeOperation(rewriter);
if (failed(decomposedIm2col)) {
return failure();
}
// Only the first decomposed value is used as the replacement.
rewriter.replaceOp(im2colOp, decomposedIm2col.value().front());
if (!unroll) {
return success();
}

// NOTE(review): the next five lines look like the old (removed) side of the
// diff — confirm.
LogicalResult matchAndRewrite(Im2colOp im2colOp,
PatternRewriter &rewriter) const override {
FailureOr<SmallVector<Value>> decomposedIm2col =
im2colOp.decomposeOperation(rewriter);
if (failed(decomposedIm2col)) {
// Unroll the loop nest created by the im2col op decomposition.
auto outerLoop = decomposedIm2col.value().front().getDefiningOp<scf::ForOp>();
assert(outerLoop &&
"expected im2col op decomposition to produce scf.for loop nest.");
// Collect the nest outermost-first by following the first yielded value of
// each loop to the scf.for that defines it.
SmallVector<scf::ForOp> loopNest({outerLoop});
while (auto innerLoop =
outerLoop.getYieldedValues()[0].getDefiningOp<scf::ForOp>()) {
loopNest.push_back(innerLoop);
outerLoop = innerLoop;
}
// Visit the collected loops in reverse order, i.e. innermost first.
for (auto loop : llvm::reverse(loopNest)) {
std::optional<int64_t> ub = getConstantIntValue(loop.getUpperBound());
// Skip loops whose upper bound is not a known constant or is exactly 1.
if (!ub.has_value() || ub.value() == 1) {
continue;
}
rewriter.setInsertionPoint(loop);
// The unroll factor is the upper bound itself; presumably this is a full
// unroll because the loops start at 0 with step 1 — TODO confirm.
if (failed(mlir::loopUnrollByFactor(loop, ub.value()))) {
loop.emitOpError("failed to unroll loop");
return failure();
}
// NOTE(review): the lines through `} // namespace` look like the old
// (removed) side of the diff — confirm.
rewriter.replaceOp(im2colOp, decomposedIm2col.value().front());
return success();
}
};

} // namespace
return success();
}

namespace {
struct DecomposeIm2colPass final
: impl::DecomposeIm2colPassBase<DecomposeIm2colPass> {
using impl::DecomposeIm2colPassBase<
DecomposeIm2colPass>::DecomposeIm2colPassBase;

void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<
affine::AffineDialect, IREE::LinalgExt::IREELinalgExtDialect,
Expand All @@ -54,8 +76,18 @@ struct DecomposeIm2colPass final

void DecomposeIm2colPass::runOnOperation() {
MLIRContext *context = &getContext();
auto funcOp = getOperation();

SmallVector<Im2colOp> candidates;
funcOp->walk([&](Im2colOp op) { candidates.push_back(op); });
IRRewriter rewriter(context);
for (auto im2colOp : candidates) {
if (failed(decomposeIm2col(im2colOp, rewriter, unroll))) {
return signalPassFailure();
}
}

RewritePatternSet patterns(context);
patterns.add<DecomposeIm2col>(context);
memref::populateResolveRankedShapedTypeResultDimsPatterns(patterns);
if (failed(
applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ def DecomposeIm2colPass :
InterfacePass<"iree-linalg-ext-decompose-im2col", "mlir::FunctionOpInterface"> {
let summary =
"Decomposes im2col ops into insert and extract slice ops";
// Pass options. `unroll` is a bool that defaults to true; pass
// `{unroll=false}` in the pipeline string to keep the loop nest.
let options = [
Option<"unroll", "unroll", "bool", /*default=*/"true",
"Unroll the resulting loop nest after decomposition.">,
];
}

def DecomposeWinogradTransformPass :
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-linalg-ext-decompose-im2col))" --split-input-file %s | FileCheck %s
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-linalg-ext-decompose-im2col{unroll=false}))" --split-input-file %s | FileCheck %s
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-linalg-ext-decompose-im2col{unroll=true}))" --split-input-file %s | FileCheck %s --check-prefix=CHECK-UNROLL

#map = affine_map<(d0) -> (d0 * 4)>
module {
Expand Down Expand Up @@ -71,3 +72,65 @@ module {
// CHECK: scf.yield %[[mLOOP]] : tensor<2x?x?xf32>
// CHECK: }
// CHECK: return %[[bLOOP]] : tensor<2x?x?xf32>

// -----

// Test input for the unrolled decomposition: an im2col with a fully static
// 2x2x4 output tile whose m_offset and k_offset are dynamic index values.
// The expectations that follow list four unrolled iterations.
#map = affine_map<(d0) -> (d0 * 4)>
module {
func.func @im2col_unrolled(%arg0: tensor<2x34x34x640xf32>, %m_off: index, %k: index) -> tensor<2x2x4xf32> {
%0 = tensor.empty() : tensor<2x2x4xf32>
%k_off = affine.apply #map(%k)
%7 = iree_linalg_ext.im2col strides = [1, 1] dilations = [1, 1] kernel_size = [3, 3] m_offset = [%m_off] k_offset = [%k_off] batch_pos = [0] m_pos = [1, 2] k_pos = [3] ins(%arg0 : tensor<2x34x34x640xf32>) outs(%0 : tensor<2x2x4xf32>) -> tensor<2x2x4xf32>
return %7 : tensor<2x2x4xf32>
}
}
// CHECK-UNROLL-DAG: #[[MAP:.+]] = affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 160) * 640)>
// CHECK-UNROLL-DAG: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) floordiv 32 + s1 floordiv 480)>
// CHECK-UNROLL-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0, s1] -> ((d0 + s0) mod 32 + s1 floordiv 160 - (s1 floordiv 480) * 3)>
// CHECK-UNROLL: func.func @im2col_unrolled(%[[ARG0:.+]]: tensor<2x34x34x640xf32>
// CHECK-UNROLL-SAME: %[[mOFF:.+]]: index, %[[K:.+]]: index)
// CHECK-UNROLL-DAG: %[[C0:.+]] = arith.constant 0 : index
// CHECK-UNROLL-DAG: %[[C1:.+]] = arith.constant 1 : index
// CHECK-UNROLL: %[[OUT_TILE:.+]] = tensor.empty() : tensor<2x2x4xf32>

// First iteration (writes output slice at offsets [0, 0, 0])
//
// CHECK-UNROLL-DAG: %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
// CHECK-UNROLL-DAG: %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C0]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL-DAG: %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C0]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL: %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[OUT_SLICE:.+]] = tensor.extract_slice %[[OUT_TILE]][%[[C0]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
// CHECK-UNROLL: %[[INSERT0:.+]] = tensor.insert_slice %[[COPY]] into %[[OUT_TILE]][%[[C0]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>

// Second iteration (writes output slice at offsets [0, 1, 0])
//
// CHECK-UNROLL-DAG: %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
// CHECK-UNROLL-DAG: %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C1]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL-DAG: %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C1]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL: %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C0]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT0]][%[[C0]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
// CHECK-UNROLL: %[[INSERT1:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT0]][%[[C0]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>

// Third iteration (writes output slice at offsets [1, 0, 0])
//
// CHECK-UNROLL-DAG: %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
// CHECK-UNROLL-DAG: %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C0]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL-DAG: %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C0]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL: %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT1]][%[[C1]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
// CHECK-UNROLL: %[[INSERT2:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT1]][%[[C1]], %[[C0]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>

// Fourth iteration (writes output slice at offsets [1, 1, 0])
//
// CHECK-UNROLL-DAG: %[[kIDX:.+]] = affine.apply #[[MAP]]()[%[[K]]]
// CHECK-UNROLL-DAG: %[[hIDX:.+]] = affine.apply #[[MAP1]](%[[C1]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL-DAG: %[[wIDX:.+]] = affine.apply #[[MAP2]](%[[C1]])[%[[mOFF]], %[[K]]]
// CHECK-UNROLL: %[[IN_SLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[C1]], %[[hIDX]], %[[wIDX]], %[[kIDX]]] [1, 1, 1, 4] [1, 1, 1, 1] : tensor<2x34x34x640xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[OUT_SLICE:.+]] = tensor.extract_slice %[[INSERT2]][%[[C1]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<2x2x4xf32> to tensor<4xf32>
// CHECK-UNROLL: %[[COPY:.+]] = linalg.copy ins(%[[IN_SLICE]] : tensor<4xf32>) outs(%[[OUT_SLICE]] : tensor<4xf32>) -> tensor<4xf32>
// CHECK-UNROLL: %[[INSERT3:.+]] = tensor.insert_slice %[[COPY]] into %[[INSERT2]][%[[C1]], %[[C1]], 0] [1, 1, 4] [1, 1, 1] : tensor<4xf32> into tensor<2x2x4xf32>

// CHECK-UNROLL: return %[[INSERT3]] : tensor<2x2x4xf32>

0 comments on commit edc5d5e

Please sign in to comment.