From 2f6cae157dc59a88afbd067aef7d6d3ee65ef2b6 Mon Sep 17 00:00:00 2001
From: CJ77Qi <1753673307@qq.com>
Date: Sun, 29 Sep 2024 23:25:38 +0800
Subject: [PATCH] gpu gemm codegen pipeline

---
 Pipelines/BufferizeOpt.cpp             |  35 ++++
 Pipelines/CMakeLists.txt               |  15 ++
 Pipelines/GPU/CMakeLists.txt           |  11 +
 Pipelines/GPU/GemmCodegenTransform.cpp | 266 +++++++++++++++++++++++++
 Pipelines/LinalgMemrefOpt.cpp          |  44 ++++
 Pipelines/LinalgTensorOpt.cpp          |  68 +++++++
 6 files changed, 439 insertions(+)
 create mode 100644 Pipelines/BufferizeOpt.cpp
 create mode 100644 Pipelines/CMakeLists.txt
 create mode 100644 Pipelines/GPU/CMakeLists.txt
 create mode 100644 Pipelines/GPU/GemmCodegenTransform.cpp
 create mode 100644 Pipelines/LinalgMemrefOpt.cpp
 create mode 100644 Pipelines/LinalgTensorOpt.cpp

diff --git a/Pipelines/BufferizeOpt.cpp b/Pipelines/BufferizeOpt.cpp
new file mode 100644
index 000000000..b7abdb368
--- /dev/null
+++ b/Pipelines/BufferizeOpt.cpp
@@ -0,0 +1,35 @@
+#include "Pipelines/BufferizeOpt.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotModuleBufferize.h"
+#include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"
+
+using namespace mlir;
+
+void mlir::buddy::createBufferizeOptPipeline(OpPassManager &pm,
+                                             const BuddyBufferizeOptOptions &options) {
+  mlir::buddy::invokeOpPassPipelineBuilder(
+      [&](OpPassManager &pm) {
+        // One-Shot Bufferize does not bufferize function arguments by
+        // default; bufferizeFunctionBoundaries makes it rewrite func
+        // signatures from tensor types to memref types.
+        bufferization::OneShotBufferizationOptions bufferizeOptions;
+        bufferizeOptions.bufferizeFunctionBoundaries = true;
+        // bufferizeOptions.allowReturnAllocsFromLoops
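+        // eliminate-empty-tensors rewrites tensor.empty values feeding
+        // destination-style ops in terms of their destinations, so they do
+        // not become spurious allocations and copies after bufferization.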
"Pipelines/GPU/GemmCodegenTransform.h" +#include "Utils/GemmCodegenUtils.h" +#include "Utils/PipelineUtils.h" + +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/PDL/IR/PDLOps.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" +#include "mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include "llvm/ADT/SmallSet.h" + +#include + +using namespace mlir; +using namespace mlir::buddy; + +namespace { + +void createAddGemmCodegenLoweringConfigTransformImpl( + OpPassManager &pm, const std::string &anchor, const std::string &prefix, + ArrayRef tileConfig, ArrayRef workGroup, int64_t stages) { + + SmallVector vecTileConfig{tileConfig}; + SmallVector vecWorkGroup{workGroup}; + + TransformInsertionConfig config; + config.funcAnchor = anchor; + config.matchPrefix = prefix; + // transform operation takes effect needed to have this op + config.opFilter = [=](Operation *op){ + if (isLinalgMatmul(op)) { + return true; + } + return false; + }; + + // pdlV is a handle of op + config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) { + auto tileConfigAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecTileConfig, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto workgroupAttrs = b.getAttr(llvm::to_vector( + llvm::map_range(vecWorkGroup, [&](int64_t i) -> Attribute { + return b.getI64IntegerAttr(i); + }))); + auto stagesAttr = b.getI64IntegerAttr(stages); + + auto func = b.create( + pdlV.getType(), pdlV, + /* isolated_from_above */ true, + /* allow_empty_results */ false, + /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()), + /* deduplicate */ false, + /* nth_parent */ 1); + + Value tileConfigValue_M = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[0] + ); + + Value tileConfigValue_N = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[1] + ); + + Value tileConfigValue_K = b.create( + // /* type */ pdl::AttributeType::get(b.getContext()), + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ tileConfigAttrs.getValue()[2] + ); + + Value workGroupValue_X = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[0] + ); + + Value workGroupValue_Y = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[1] + ); + + Value workGroupValue_Z = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ workgroupAttrs.getValue()[2] + ); + + Value stagesValue = b.create( + transform::ParamType::get(b.getContext(), mlir::IntegerType::get(b.getContext(), 64)), + /* value */ stagesAttr + ); + + b.create(func, getGemmTileMConfigAttrName(), + tileConfigValue_M); + b.create(func, getGemmTileNConfigAttrName(), + tileConfigValue_N); + b.create(func, getGemmTileKConfigAttrName(), + tileConfigValue_K); + b.create(func, getGemmBlockXSizeAttrName(), + workGroupValue_X); + b.create(func, 
+    b.create<transform::AnnotateOp>(func, getGemmTileMConfigAttrName(),
+                                    tileConfigValue_M);
+    b.create<transform::AnnotateOp>(func, getGemmTileNConfigAttrName(),
+                                    tileConfigValue_N);
+    b.create<transform::AnnotateOp>(func, getGemmTileKConfigAttrName(),
+                                    tileConfigValue_K);
+    b.create<transform::AnnotateOp>(func, getGemmBlockXSizeAttrName(),
+                                    workGroupValue_X);
+    b.create<transform::AnnotateOp>(func, getGemmBlockYSizeAttrName(),
+                                    workGroupValue_Y);
+    b.create<transform::AnnotateOp>(func, getGemmBlockZSizeAttrName(),
+                                    workGroupValue_Z);
+    b.create<transform::AnnotateOp>(func, getGemmPipelineStageAttrName(),
+                                    stagesValue);
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(config));
+}
+
+} // namespace
+
+void mlir::buddy::createGemmTileConfigInsertTransform(
+    OpPassManager &pm, const GPUGemmCodegenConfigOptions &options) {
+  invokeOpPassPipelineBuilder(
+      createAddGemmCodegenLoweringConfigTransformImpl, pm,
+      options.funcAnchor, options.annotatePrefix, options.tileConfig,
+      options.workGroup, options.stages);
+}
+
+namespace {
+
+// TODO: Epilogue
+void createGemmTileTransformImpl(OpPassManager &pm,
+                                 const std::string &anchor,
+                                 const std::string &prefix) {
+  TransformInsertionConfig config;
+  config.funcAnchor = anchor;
+  config.matchPrefix = prefix;
+  config.opFilter = [=](Operation *op) {
+    return isLinalgMatmul(op);
+  };
+  config.transformBuilder = [=](ImplicitLocOpBuilder &b, Operation *op, Value pdlV) {
+    func::FuncOp funcOp = op->getParentOfType<func::FuncOp>();
+    linalg::LinalgOp linalgOp = cast<linalg::LinalgOp>(op);
+
+    SmallVector<int64_t, 3> tileConfig = getGemmTileSize(funcOp).value();
+    SmallVector<int64_t, 3> workGroup = getGemmBlockSize(funcOp).value();
+    int64_t stages = getGemmPipelineStages(funcOp).value();
+
+    bool hasEpilogue = false;
+
+    auto func = b.create<transform::GetParentOp>(
+        pdlV.getType(), pdlV,
+        /* isolated_from_above */ false,
+        /* allow_empty_results */ false,
+        /* op_name */ b.getStringAttr(func::FuncOp::getOperationName()),
+        /* deduplicate */ false,
+        /* nth_parent */ 1);
+
+    auto linalgFillType = transform::OperationType::get(
+        b.getContext(), linalg::FillOp::getOperationName());
+    auto linalgFillOp = b.create<transform::MatchOp>(
+        /* resultTypes */ linalgFillType,
+        /* target */ func,
+        /* opNames */ linalg::FillOp::getOperationName());
+
+    SmallVector<int64_t> mappingIdx;
+    bool isBMM = linalgOp.getNumParallelLoops() == 3;
+    if (isBMM) {
+      // 2 -> blockIdx.z, 1 -> blockIdx.y, 0 -> blockIdx.x
+      mappingIdx = {2, 1, 0};
+    } else {
+      // 1 -> blockIdx.y, 0 -> blockIdx.x
+      mappingIdx = {1, 0};
+    }
+
+    // Build the GPU blockIdx mapping attributes.
+    auto mapping = llvm::to_vector(llvm::map_range(
+        mappingIdx,
+        [](int64_t i) { return static_cast<gpu::MappingId>(i); }));
+    auto mappingAttrs = llvm::to_vector(llvm::map_range(
+        mapping,
+        [&](gpu::MappingId dim) -> Attribute {
+          return gpu::GPUBlockMappingAttr::get(b.getContext(), dim);
+        }));
+
+    SmallVector<int64_t> parallelTileSizes;
+    if (isBMM) {
+      parallelTileSizes = {1, tileConfig[0], tileConfig[1]};
+    } else {
+      parallelTileSizes = {tileConfig[0], tileConfig[1]};
+    }
+
+    // Tile DimM and DimN; each tile is dispatched to one block.
+    Value tiledMatmulOp;
+    if (hasEpilogue) {
+      // TODO
+    } else {
+      transform::TileUsingForallOp tiledResultOp =
+          b.create<transform::TileUsingForallOp>(
+              /* target */ pdlV,
+              /* staticTileSizes */ parallelTileSizes,
+              /* ctor tag */ transform::TileSizesSpec(),
+              /* mapping */ b.getArrayAttr(mappingAttrs));
+
+      if (linalgFillOp) {
+        b.create<transform::FuseIntoContainingOp>(
+            /* producerOp */ linalgFillOp,
+            /* containingOp */ tiledResultOp.getForallOp());
+      }
+      tiledMatmulOp = tiledResultOp.getTiledOp();
+    }
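+    // At this point each (BM, BN) tile lives in its own scf.forall iteration
+    // mapped to a GPU block, and the fused linalg.fill initializes the
+    // accumulator inside that same block.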
+    // Only tile DimK of the matmul that was dispatched to each block.
+    SmallVector<int64_t> reduceTileSize;
+    if (isBMM) {
+      reduceTileSize = {0, 0, 0, tileConfig[2]};
+    } else {
+      reduceTileSize = {0, 0, tileConfig[2]};
+    }
+
+    auto tiledKMatmulOp =
+        b.create<transform::TileUsingForOp>(
+            /* target */ tiledMatmulOp,
+            /* staticTileSizes */ reduceTileSize);
+
+    // for k in K, stepping by tileConfig[2]
+    auto forLoops = tiledKMatmulOp.getLoops();
+    // The tiled matmul computes a (BM, BN, tileConfig[2]) chunk.
+    auto kMatmulOp = tiledKMatmulOp.getTiledLinalgOp();
+
+    if (!forLoops.empty()) {
+      b.create<transform::AnnotateOp>(forLoops[0], getMatmulKMainLoopMarker(),
+                                      Value());
+    } else {
+      b.create<transform::AnnotateOp>(kMatmulOp, getMatmulKMainLoopMarker(),
+                                      Value());
+    }
+
+    // Value mmaLevel = b.create<transform::ParamConstantOp>(
+    //     /* type */ transform::ParamType::get(b.getContext(), b.getStringAttr()),
+    //     /* value */ b.getStringAttr("Threadblock"));
+
+    // b.create<transform::AnnotateOp>(kMatmulOp, getLinalgMMALevelAttrName(),
+    //                                 mmaLevel);
+    b.create<transform::AnnotateOp>(kMatmulOp, getMMAPatternAttrName(),
+                                    Value());
+  };
+
+  pm.addPass(createGenericTransformInsertionPass(config));
+}
+
+} // namespace
+
+void mlir::buddy::createGemmTileTransform(OpPassManager &pm,
+                                          const GPUGemmGeneralOptions &options) {
+  invokeOpPassPipelineBuilder(
+      createGemmTileTransformImpl, pm,
+      options.funcAnchor, options.annotatePrefix);
+}
\ No newline at end of file
diff --git a/Pipelines/LinalgMemrefOpt.cpp b/Pipelines/LinalgMemrefOpt.cpp
new file mode 100644
index 000000000..ad49f19a4
--- /dev/null
+++ b/Pipelines/LinalgMemrefOpt.cpp
@@ -0,0 +1,44 @@
+#include "Pipelines/LinalgMemrefOpt.h"
+#include "GPU/Transforms/GPUDistributeToWarp.h"
+#include "GPU/Transforms/RemoveReduntantLoops.h"
+#include "GPU/Transforms/TensorCoreVectorization.h"
+#include "Linalg/Transforms/LinalgPromotion.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
+#include "Utils/PipelineUtils.h"
+#include "mlir/Transforms/Passes.h"
+#include <string>
+
+using namespace mlir;
+
+namespace {
+
+void addGemmLinalgMemrefOptPipeline(OpPassManager &pm) {
+  // TODO: use funcAnchor to restrict this to the specific matmul func.
+  pm.addNestedPass<func::FuncOp>(createLinalgPromotionPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createCSEPass());
+  pm.addNestedPass<func::FuncOp>(createCanonicalizerPass());
+  pm.addNestedPass<func::FuncOp>(createGPUDistributeToWarpPass());
+  pm.addNestedPass<func::FuncOp>(createRemoveReduntantLoops());
+  pm.addNestedPass<func::FuncOp>(createTensorCoreVectorizationPass());
+}
+
+void createLinalgMemrefOptPipelineImpl(OpPassManager &pm,
+                                       const std::string target) {
+  addGemmLinalgMemrefOptPipeline(pm);
+}
+
+} // namespace
+
+void mlir::buddy::createLinalgMemrefOptPipeline(OpPassManager &pm,
+    const LinalgMemrefOptPipelineOptions &options) {
+  invokeOpPassPipelineBuilder(createLinalgMemrefOptPipelineImpl, pm, options.target);
+}
+
+void mlir::buddy::registerLinalgMemrefOptPipeline() {
+  PassPipelineRegistration<LinalgMemrefOptPipelineOptions>(
+      "linalg-memref-opt", "Linalg Opt Pipeline with Memref",
+      createLinalgMemrefOptPipeline);
+}
\ No newline at end of file
diff --git a/Pipelines/LinalgTensorOpt.cpp b/Pipelines/LinalgTensorOpt.cpp
new file mode 100644
index 000000000..8ee373fda
--- /dev/null
+++ b/Pipelines/LinalgTensorOpt.cpp
@@ -0,0 +1,68 @@
+#include "Pipelines/LinalgTensorOpt.h"
+#include "Utils/GemmCodegenUtils.h"
+#include "Utils/PipelineUtils.h"
+#include "Pipelines/GPU/GemmCodegenTransform.h"
+
+#include "Transform/Transforms/TransformDialectInterpreter.h"
+#include "Transform/Transforms/TransformInsertion.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+
+using namespace mlir;
+
+namespace {
+
+void addGPULinalgOptPasses(OpPassManager &pm) {
+  { // GEMM codegen: Linalg-on-tensor optimizations.
+    // TODO: mark only the funcs that actually contain a GEMM linalg op.
+    // For now the option's funcAnchor is left empty, which treats every
+    // func as if it contained a matmul op.
+    auto funcGemmAnchor = mlir::buddy::getGemmMarkerAttrName().str();
+    // TileSizeConfig of Dim(M) & Dim(N) & Dim(K) -> BM & BN & BK
+    // blockIdx.y = M / BM
+    // blockIdx.x = N / BN
+    SmallVector<int64_t> tileConfig = {128, 128, 64};
+    // threadIdx.x, threadIdx.y, threadIdx.z
+    SmallVector<int64_t> workGroup = {32, 4, 1};
+    int64_t stages = 3;
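+    // Worked example (illustrative): for a 1024x1024 output, the grid is
+    // (N/BN, M/BM) = (8, 8) blocks; each block runs 32*4*1 = 128 threads
+    // (4 warps) producing one 128x128 tile while stepping through K in
+    // chunks of 64, with the K loop software-pipelined over 3 stages.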
+    mlir::buddy::GPUGemmCodegenConfigOptions configOptions;
+    configOptions.tileConfig = tileConfig;
+    configOptions.workGroup = workGroup;
+    configOptions.stages = stages;
+    createGemmTileConfigInsertTransform(pm, configOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+
+    mlir::buddy::GPUGemmGeneralOptions generalOptions;
+    createGemmTileTransform(pm, generalOptions);
+    pm.addPass(createTransformDialectInterpreter(true));
+    pm.addPass(createCanonicalizerPass());
+    pm.addPass(createCSEPass());
+  }
+}
+
+void createLinalgTensorOptPassPipelineimpl(OpPassManager &pm,
+                                           const std::string &target,
+                                           const std::string &arch) {
+  if (target == "gpu") {
+    addGPULinalgOptPasses(pm);
+  } else {
+    /*TODO*/
+  }
+}
+
+} // namespace
+
+void mlir::buddy::createLinalgTensorOptPassPipeline(OpPassManager &pm,
+    const LinalgTensorOptPipelineOptions &options) {
+  mlir::buddy::invokeOpPassPipelineBuilder(createLinalgTensorOptPassPipelineimpl, pm, options.target, options.arch);
+}
+
+void mlir::buddy::registerLinalgTensorOptPassPipeline() {
+  PassPipelineRegistration<LinalgTensorOptPipelineOptions>(
+      "linalg-tensor-opt",
+      "Linalg with Tensor Opt Pass Pipeline",
+      mlir::buddy::createLinalgTensorOptPassPipeline);
+}
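
The three pipelines registered above only become visible on a command line once a driver calls their register* entry points. Below is a minimal sketch of such a driver, assuming an mlir-opt-style tool; the tool name and dialect setup are assumptions, while MlirOptMain, registerAllDialects, and asMainReturnCode come from upstream MLIR:

#include "Pipelines/BufferizeOpt.h"
#include "Pipelines/LinalgMemrefOpt.h"
#include "Pipelines/LinalgTensorOpt.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/InitAllDialects.h"
#include "mlir/Tools/mlir-opt/MlirOptMain.h"

int main(int argc, char **argv) {
  // Expose -linalg-tensor-opt, -bufferize-opt, and -linalg-memref-opt as
  // pass-pipeline options of this (hypothetical) driver.
  mlir::buddy::registerLinalgTensorOptPassPipeline();
  mlir::buddy::registerBufferizeOptPassPipeline();
  mlir::buddy::registerLinalgMemrefOptPipeline();

  mlir::DialectRegistry registry;
  mlir::registerAllDialects(registry);
  return mlir::asMainReturnCode(
      mlir::MlirOptMain(argc, argv, "GEMM codegen pipeline driver\n", registry));
}

Chained in order, the pipelines take a linalg matmul from tensors to tiled, promoted, tensor-core-vectorized memref IR, e.g. driver-opt matmul.mlir -linalg-tensor-opt -bufferize-opt -linalg-memref-opt (the exact spelling of the target/arch option keys depends on the *PipelineOptions declarations, which are not part of this patch).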