From 2f6cae157dc59a88afbd067aef7d6d3ee65ef2b6 Mon Sep 17 00:00:00 2001
From: CJ77Qi <1753673307@qq.com>
Date: Sun, 29 Sep 2024 23:25:38 +0800
Subject: [PATCH] gpu gemm codegen pipeline

---
 Pipelines/BufferizeOpt.cpp             |  35 ++++
 Pipelines/CMakeLists.txt               |  15 ++
 Pipelines/GPU/CMakeLists.txt           |  11 +
 Pipelines/GPU/GemmCodegenTransform.cpp | 266 +++++++++++++++++++++++++
 Pipelines/LinalgMemrefOpt.cpp          |  44 ++++
 Pipelines/LinalgTensorOpt.cpp          |  68 +++++++
 6 files changed, 439 insertions(+)
 create mode 100644 Pipelines/BufferizeOpt.cpp
 create mode 100644 Pipelines/CMakeLists.txt
 create mode 100644 Pipelines/GPU/CMakeLists.txt
 create mode 100644 Pipelines/GPU/GemmCodegenTransform.cpp
 create mode 100644 Pipelines/LinalgMemrefOpt.cpp
 create mode 100644 Pipelines/LinalgTensorOpt.cpp

diff --git a/Pipelines/BufferizeOpt.cpp b/Pipelines/BufferizeOpt.cpp
new file mode 100644
index 000000000..b7abdb368
--- /dev/null
+++ b/Pipelines/BufferizeOpt.cpp
@@ -0,0 +1,35 @@
+#include "Pipelines/BufferizeOpt.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotModuleBufferize.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"
+
+using namespace mlir;
+
+void mlir::buddy::createBufferizeOptPipeline(OpPassManager &pm,
+                                             const BuddyBufferizeOptOptions &options) {
+  mlir::buddy::invokeOpPassPipelineBuilder(
+      [&](OpPassManager &pm) {
+        // One-Shot Bufferize does not bufferize function arguments by
+        // default; bufferizeFunctionBoundaries makes it rewrite func
+        // signatures from tensor types to memref types.
+        bufferization::OneShotBufferizationOptions bufferizeOptions;
+        bufferizeOptions.bufferizeFunctionBoundaries = true;
+        // bufferizeOptions.allowReturnAllocsFromLoops
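+        // eliminate-empty-tensors rewrites tensor.empty values feeding
+        // destination-style ops in terms of their destinations, so they do
+        // not become spurious allocations and copies after bufferization.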
"Pipelines/GPU/GemmCodegenTransform.h" +#include "Utils/GemmCodegenUtils.h" +#include "Utils/PipelineUtils.h" + +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include "llvm/ADT/SmallSet.h" + +#include + +using namespace mlir; +using namespace mlir::buddy; + +namespace { + +void createAddGemmCodegenLoweringConfigTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + ArrayRef tileConfig, ArrayRef workGroup, int64_t stages) { + + SmallVector vecTileConfig{tileConfig}; + SmallVector vecWorkGroup{workGroup}; + + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + // transform operation takes effect needed to have this op + config.opFilter = [=](Operation *op){ + if (isLinalgMatmul(op)) { + return true; + } + return false; + }; + + // pdlV is a handle of op + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { + auto tileConfigAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecTileConfig, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto workgroupAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecWorkGroup, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto stagesAttr = b.getI64IntegerAttr(stages); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ true, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + Value tileConfigValue_M = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[0] + ); + + Value tileConfigValue_N = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[1] + ); + + Value tileConfigValue_K = b.create( + // /* type */ pdl::AttributeType::get(b.getContext()), + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[2] + ); + + Value workGroupValue_X = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[0] + ); + + Value workGroupValue_Y = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[1] + ); + + Value workGroupValue_Z = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[2] + ); + + Value stagesValue = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ stagesAttr + ); + + b.create(func, getGemmTileMConfigAttrName(), + tileConfigValue_M); + b.create(func, getGemmTileNConfigAttrName(), + tileConfigValue_N); + b.create(func, getGemmTileKConfigAttrName(), + tileConfigValue_K); + b.create(func, getGemmBlockXSizeAttrName(), + workGroupValue_X); + b.create(func, 
+    b.create<transform::AnnotateOp>(func, getGemmTileMConfigAttrName(),
+                                    tileConfigValue_M);
+    b.create<transform::AnnotateOp>(func, getGemmTileNConfigAttrName(),
+                                    tileConfigValue_N);
+    b.create<transform::AnnotateOp>(func, getGemmTileKConfigAttrName(),
+                                    tileConfigValue_K);
+    b.create<transform::AnnotateOp>(func, getGemmBlockXSizeAttrName(),
+                                    workGroupValue_X);
+    b.create<transform::AnnotateOp>(func, getGemmBlockYSizeAttrName(),
+                                    workGroupValue_Y);
+    b.create<transform::AnnotateOp>(func, getGemmBlockZSizeAttrName(),
+                                    workGroupValue_Z);
+    b.create<transform::AnnotateOp>(func, getGemmPipelineStageAttrName(),
+                                    stagesValue);
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(config));
+}
+
+} // namespace
+
+void mlir::buddy::createGemmTileConfigInsertTransform(
+    OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) {
+  invokeOpPassPipelineBuilder(
+      createAddGemmCodegenLoweringConfigTransformImpl, pm,
+      options.funcAnchor, options.annotatePrefix, options.tileConfig,
+      options.workGroup, options.stages);
+}
+
+namespace {
+
+// TODO: Epilogue
+void createGemmTileTransformImpl(OpPassManager &pm,
+                                 const std::string &anchor,
+                                 const std::string &prefix) {
+  TransformInsertionConfig config;
+  config.funcAnchor = anchor;
+  config.matchPrefix = prefix;
+  config.opFilter = [=](Operation *op) {
+    return isLinalgMatmul(op);
+  };
+  config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) {
+    func::FuncOp funcOp = op->getParentOfType<func::FuncOp>();
+    linalg::LinalgOp linalgOp = cast<linalg::LinalgOp>(op);
+
+    SmallVector<int64_t, 3> tileConfig = getGemmTileSize(funcOp).value();
+    SmallVector<int64_t, 3> workGroup = getGemmBlockSize(funcOp).value();
+    int64_t stages = getGemmPipelineStages(funcOp).value();
+
+    bool hasEpilogue = false;
+
+    auto func = b.create<transform::GetParentOp>(
+        pdlV.getType(), pdlV,
+        /* isolated_from_above */ false,
+        /* allow_empty_results */ false,
+        /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()),
+        /* deduplicate */ false,
+        /* nth_parent */ 1);
+
+    auto linalgFillType = transform::OperationType::get(
+        b.getContext(), linalg::FillOp::getOperationName());
+    auto linalgFillOp = b.create<transform::MatchOp>(
+        /* resultTypes */ linalgFillType,
+        /* target */ func,
+        /* opNames */ linalg::FillOp::getOperationName());
+
+    SmallVector<int64_t> mappingIdx;
+    bool isBMM = linalgOp.getNumParallelLoops() == 3;
+    if (isBMM) {
+      // 2 -> blockIdx.z, 1 -> blockIdx.y, 0 -> blockIdx.x
+      mappingIdx = {2, 1, 0};
+    } else {
+      // 1 -> blockIdx.y, 0 -> blockIdx.x
+      mappingIdx = {1, 0};
+    }
+
+    // Build the GPU blockIdx mapping attributes.
+    auto mapping = llvm::to_vector(llvm::map_range(
+        mappingIdx,
+        [](int64_t i) { return static_cast<gpu::MappingId>(i); }));
+    auto mappingAttrs = llvm::to_vector(llvm::map_range(
+        mapping,
+        [&](gpu::MappingId dim) -> Attribute {
+          return gpu::GPUBlockMappingAttr::get(b.getContext(), dim);
+        }));
+
+    SmallVector<int64_t> parallelTileSizes;
+    if (isBMM) {
+      parallelTileSizes = {1, tileConfig[0], tileConfig[1]};
+    } else {
+      parallelTileSizes = {tileConfig[0], tileConfig[1]};
+    }
+
+    // Tile DimM and DimN; each tile is dispatched to one block.
+    Value tiledMatmulOp;
+    if (hasEpilogue) {
+      // TODO
+    } else {
+      transform::TileUsingForallOp tiledResultOp =
+          b.create<transform::TileUsingForallOp>(
+              /* target */ pdlV,
+              /* staticTileSizes */ parallelTileSizes,
+              /* ctor tag */ transform::TileSizesSpec(),
+              /* mapping */ b.getArrayAttr(mappingAttrs));
+
+      if (linalgFillOp) {
+        b.create<transform::FuseIntoContainingOp>(
+            /* producerOp */ linalgFillOp,
+            /* containingOp */ tiledResultOp.getForallOp());
+      }
+      tiledMatmulOp = tiledResultOp.getTiledOp();
+    }
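+    // At this point each (BM, BN) tile lives in its own scf.forall iteration
+    // mapped to a GPU block, and the fused linalg.fill initializes the
+    // accumulator inside that same block.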
+    // Only tile DimK of the matmul that was dispatched to each block.
+    SmallVector<int64_t> reduceTileSize;
+    if (isBMM) {
+      reduceTileSize = {0, 0, 0, tileConfig[2]};
+    } else {
+      reduceTileSize = {0, 0, tileConfig[2]};
+    }
+
+    auto tiledKMatmulOp =
+        b.create<transform::TileUsingForOp>(
+            /* target */ tiledMatmulOp,
+            /* staticTileSizes */ reduceTileSize);
+
+    // for k in K, stepping by tileConfig[2]
+    auto forLoops = tiledKMatmulOp.getLoops();
+    // The tiled matmul computes a (BM, BN, tileConfig[2]) chunk.
+    auto kMatmulOp = tiledKMatmulOp.getTiledLinalgOp();
+
+    if (!forLoops.empty()) {
+      b.create<transform::AnnotateOp>(forLoops[0], getMatmulKMainLoopMarker(),
+                                      Value());
+    } else {
+      b.create<transform::AnnotateOp>(kMatmulOp, getMatmulKMainLoopMarker(),
+                                      Value());
+    }
+
+    // Value mmaLevel = b.create<transform::ParamConstantOp>(
+    //     /* type */ transform::ParamType::get(b.getContext(), b.getStringAttr()),
+    //     /* value */ b.getStringAttr("Threadblock"));
+
+    // b.create<transform::AnnotateOp>(kMatmulOp, getLinalgMMALevelAttrName(),
+    //                                 mmaLevel);
+    b.create<transform::AnnotateOp>(kMatmulOp, getMMAPatternAttrName(),
+                                    Value());
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(config));
+}
+
+} // namespace
+
+void mlir::buddy::createGemmTileTransform(OpPassManager &pm,
+                                          const GPUGemmGeneralOptions &options) {
+  invokeOpPassPipelineBuilder(
+      createGemmTileTransformImpl, pm,
+      options.funcAnchor, options.annotatePrefix);
+}
\ No newline at end of file
diff --git a/Pipelines/LinalgMemrefOpt.cpp b/Pipelines/LinalgMemrefOpt.cpp
new file mode 100644
index 000000000..ad49f19a4
--- /dev/null
+++ b/Pipelines/LinalgMemrefOpt.cpp
@@ -0,0 +1,44 @@
+#include "Pipelines/LinalgMemrefOpt.h"
+#include "GPU/Transforms/GPUDistributeToWarp.h"
+#include "GPU/Transforms/RemoveReduntantLoops.h"
+#include "GPU/Transforms/TensorCoreVectorization.h"
+#include "Linalg/Transforms/LinalgPromotion.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "Utils/PipelineUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include <string>
+
+using namespace mlir;
+
+namespace {
+
+void addGemmLinalgMemrefOptPipeline(OpPassManager &pm) {
+  // TODO: use funcAnchor to restrict this to the specific matmul func.
+  pm.addNestedPass<func::FuncOp>(createLinalgPromotionPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCSEPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createGPUDistributeToWarpPass());
+  pm.addNestedPass<func::FuncOp>(createRemoveReduntantLoops());
+  pm.addNestedPass<func::FuncOp>(createTensorCoreVectorizationPass());
+}
+
+void createLinalgMemrefOptPipelineImpl(OpPassManager &pm,
+                                       const std::string target) {
+  addGemmLinalgMemrefOptPipeline(pm);
+}
+
+} // namespace
+
+void mlir::buddy::createLinalgMemrefOptPipeline(OpPassManager &pm,
+    const LinalgMemrefOptPipelineOptions &options) {
+  invokeOpPassPipelineBuilder(createLinalgMemrefOptPipelineImpl, pm, options.target);
+}
+
+void mlir::buddy::registerLinalgMemrefOptPipeline() {
+  PassPipelineRegistration<LinalgMemrefOptPipelineOptions>(
+      "linalg-memref-opt", "Linalg Opt Pipeline with Memref",
+      createLinalgMemrefOptPipeline);
+}
\ No newline at end of file
diff --git a/Pipelines/LinalgTensorOpt.cpp b/Pipelines/LinalgTensorOpt.cpp
new file mode 100644
index 000000000..8ee373fda
--- /dev/null
+++ b/Pipelines/LinalgTensorOpt.cpp
@@ -0,0 +1,68 @@
+#include "Pipelines/LinalgTensorOpt.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"
+#include "Pipelines/GPU/GemmCodegenTransform.h"
+
+#include "Transform/Transforms/TransformDialectInterpreter.h"
+#include "Transform/Transforms/TransformInsertion.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+
+namespace {
+
+void addGPULinalgOptPasses(OpPassManager &pm) {
+  { // GEMM codegen: Linalg-on-tensor optimizations.
+    // TODO: mark only the funcs that actually contain a GEMM linalg op.
+    // For now the option's funcAnchor is left empty, which treats every
+    // func as if it contained a matmul op.
+    auto funcGemmAnchor = mlir::buddy::getGemmMarkerAttrName().str();
+    // TileSizeConfig of Dim(M) & Dim(N) & Dim(K) -> BM & BN & BK
+    // blockIdx.y = M / BM
+    // blockIdx.x = N / BN
+    SmallVector<int64_t> tileConfig = {128, 128, 64};
+    // threadIdx.x, threadIdx.y, threadIdx.z
+    SmallVector<int64_t> workGroup = {32, 4, 1};
+    int64_t stages = 3;
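+    // Worked example (illustrative): for a 1024x1024 output, the grid is
+    // (N/BN, M/BM) = (8, 8) blocks; each block runs 32*4*1 = 128 threads
+    // (4 warps) producing one 128x128 tile while stepping through K in
+    // chunks of 64, with the K loop software-pipelined over 3 stages.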
+    mlir::buddy::GPUGemmCodegenConfigOptions configOptions;
+    configOptions.tileConfig = tileConfig;
+    configOptions.workGroup = workGroup;
+    configOptions.stages = stages;
+    createGemmTileConfigInsertTransform(pm, configOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+
+    mlir::buddy::GPUGemmGeneralOptions generalOptions;
+    createGemmTileTransform(pm, generalOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+    pm.addPass(createCanonicalizerPass());
+    pm.addPass(createCSEPass());
+  }
+}
+
+void createLinalgTensorOptPassPipelineimpl(OpPassManager &pm,
+                                           const std::string &target,
+                                           const std::string &arch) {
+  if (target == "gpu") {
+    addGPULinalgOptPasses(pm);
+  } else {
+    /*TODO*/
+  }
+}
+
+} // namespace
+
+void mlir::buddy::createLinalgTensorOptPassPipeline(OpPassManager &pm,
+    const LinalgTensorOptPipelineOptions &options) {
+  mlir::buddy::invokeOpPassPipelineBuilder(createLinalgTensorOptPassPipelineimpl, pm, options.target, options.arch);
+}
+
+void mlir::buddy::registerLinalgTensorOptPassPipeline() {
+  PassPipelineRegistration<LinalgTensorOptPipelineOptions>(
+      "linalg-tensor-opt",
+      "Linalg with Tensor Opt Pass Pipeline",
+      mlir::buddy::createLinalgTensorOptPassPipeline);
+}
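
The three pipelines registered above only become visible on a command line once a driver calls their register* entry points. Below is a minimal sketch of such a driver, assuming an mlir-opt-style tool; the tool name and dialect setup are assumptions, while MlirOptMain, registerAllDialects, and asMainReturnCode come from upstream MLIR:

#include "Pipelines/BufferizeOpt.h"
#include "Pipelines/LinalgMemrefOpt.h"
#include "Pipelines/LinalgTensorOpt.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/InitAllDialects.h"
#include "mlir/Tools/mlir-opt/MlirOptMain.h"

int main(int argc, char **argv) {
  // Expose -linalg-tensor-opt, -bufferize-opt, and -linalg-memref-opt as
  // pass-pipeline options of this (hypothetical) driver.
  mlir::buddy::registerLinalgTensorOptPassPipeline();
  mlir::buddy::registerBufferizeOptPassPipeline();
  mlir::buddy::registerLinalgMemrefOptPipeline();

  mlir::DialectRegistry registry;
  mlir::registerAllDialects(registry);
  return mlir::asMainReturnCode(
      mlir::MlirOptMain(argc, argv, "GEMM codegen pipeline driver\n", registry));
}

Chained in order, the pipelines take a linalg matmul from tensors to tiled, promoted, tensor-core-vectorized memref IR, e.g. driver-opt matmul.mlir -linalg-tensor-opt -bufferize-opt -linalg-memref-opt (the exact spelling of the target/arch option keys depends on the *PipelineOptions declarations, which are not part of this patch).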