buddy-compiler · 6somehow · Sep 26, 2024
diff --git a/examples/MLIRLinalg/linalg-batch-matmul-dync.mlir b/examples/MLIRLinalg/linalg-batch-matmul-dync.mlir
@@ -0,0 +1,65 @@
+// RUN: buddy-opt %s \
+// RUN:     -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN:     -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN:     -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+  // Runtime printer; resolved from the runner-utils shared libraries passed
+  // with -shared-libs in the RUN lines above.
+  func.func private @printMemrefF32(memref<*xf32>)
+
+  // Batched matrix multiplication on dynamically shaped buffers:
+  //   C[b, m, n] += sum over k of A[b, m, k] * B[b, k, n].
+  // The result is accumulated into %C, so callers must initialise it.
+  func.func @buddy_batchmatmul_f32(%A: memref<?x?x?xf32>, %B: memref<?x?x?xf32>, %C: memref<?x?x?xf32>) {
+    linalg.batch_matmul
+      ins(%A, %B : memref<?x?x?xf32>, memref<?x?x?xf32>)
+      outs(%C : memref<?x?x?xf32>)
+    return
+  }
+
+  // Driver: multiplies an all-ones A by an all-twos B and prints C.
+  func.func @main() {
+    // Problem sizes.
+    %cBatch = arith.constant 10 : index
+    %cM = arith.constant 2 : index
+    %cN = arith.constant 5 : index
+    %cK = arith.constant 4 : index
+
+    // Fill values.
+    %cf1 = arith.constant 1.0 : f32
+    %cf2 = arith.constant 2.0 : f32
+    %c0 = arith.constant 0.0 : f32
+
+    // A is [batch, M, K], B is [batch, K, N], C is [batch, M, N].
+    %A = memref.alloc(%cBatch, %cM, %cK) : memref<?x?x?xf32>
+    %B = memref.alloc(%cBatch, %cK, %cN) : memref<?x?x?xf32>
+    %C = memref.alloc(%cBatch, %cM, %cN) : memref<?x?x?xf32>
+
+    linalg.fill ins(%cf1 : f32) outs(%A : memref<?x?x?xf32>)
+    linalg.fill ins(%cf2 : f32) outs(%B : memref<?x?x?xf32>)
+    // The matmul accumulates, so C has to start at zero.
+    linalg.fill ins(%c0 : f32) outs(%C : memref<?x?x?xf32>)
+
+    call @buddy_batchmatmul_f32(%A, %B, %C) : (memref<?x?x?xf32>, memref<?x?x?xf32>, memref<?x?x?xf32>) -> ()
+
+    // Print output.
+    // Each element of C is the sum over K = 4 of 1.0 * 2.0 = 8, and C is a
+    // contiguous [10, 2, 5] buffer, so the printer emits 10 * 2 = 20 rows of
+    // five 8s. The row pattern relies on the default whitespace handling of
+    // the checker, which collapses the printer's column padding.
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [10, 2, 5] strides = [10, 5, 1] data =
+    // CHECK-COUNT-20: [8, 8, 8, 8, 8]
+    %print_C = memref.cast %C : memref<?x?x?xf32> to memref<*xf32>
+    call @printMemrefF32(%print_C) : (memref<*xf32>) -> ()
+
+    memref.dealloc %C : memref<?x?x?xf32>
+    memref.dealloc %B : memref<?x?x?xf32>
+    memref.dealloc %A : memref<?x?x?xf32>
+    return
+  }
+}
diff --git a/examples/MLIRLinalg/linalg-conv2d_nhwc_fhwc.mlir b/examples/MLIRLinalg/linalg-conv2d_nhwc_fhwc.mlir
@@ -0,0 +1,88 @@
+// RUN: buddy-opt %s \
+// RUN:     -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN:     -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN:     -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+  func.func private @printMemrefF32(memref<*xf32>)
+
+  // Allocates a 4-D f32 buffer of shape [%arg0, %arg1, %arg2, %arg3] and
+  // stores the fill value %arg4 into every element. The buffer is returned
+  // to the caller, which is responsible for deallocating it.
+  func.func @alloc_2d_filled_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref<?x?x?x?xf32> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref<?x?x?x?xf32>
+    scf.for %arg5 = %c0 to %arg0 step %c1 {
+      scf.for %arg6 = %c0 to %arg1 step %c1 {
+        scf.for %arg7 = %c0 to %arg2 step %c1 {
+          scf.for %arg8 = %c0 to %arg3 step %c1 {
+            memref.store %arg4, %0[%arg5, %arg6, %arg7, %arg8] : memref<?x?x?x?xf32>
+          }
+        }
+      }
+    }
+    return %0 : memref<?x?x?x?xf32>
+  }
+
+  // 2-D convolution with an NHWC image (%arg0) and an FHWC filter (%arg1).
+  // No strides or dilations are given, so the op defaults apply. Results are
+  // accumulated into %arg2.
+  func.func @conv_2d_nhwc_fhwc(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
+    linalg.conv_2d_nhwc_fhwc ins(%arg0, %arg1 : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) outs(%arg2 : memref<?x?x?x?xf32>)
+    return
+  }
+
+  func.func @main() {
+    // Fill values: 0.5 for the image and the filter, 0.0 for the output.
+    %cst = arith.constant 0.500000e+00 : f32
+    %cst_0 = arith.constant 0.000000e+00 : f32
+
+    // Image: N = 2, H = 8, W = 8, C = 18.
+    %current_image_n = arith.constant 2 : index
+    %current_image_c = arith.constant 18 : index
+    %current_image_h = arith.constant 8 : index
+    %current_image_w = arith.constant 8 : index
+
+    // Filter: F = 2, H = 4, W = 4, C = 18.
+    %current_filter_f = arith.constant 2 : index
+    %current_filter_c = arith.constant 18 : index
+    %current_filter_h = arith.constant 4 : index
+    %current_filter_w = arith.constant 4 : index
+
+    // Output: N = 2, H = W = 8 - 4 + 1 = 5, one channel per filter (F = 2).
+    %current_output_n = arith.constant 2 : index
+    %current_output_c = arith.constant 2 : index
+    %current_output_h = arith.constant 5 : index
+    %current_output_w = arith.constant 5 : index
+
+    // Image.
+    %image = call @alloc_2d_filled_f32(%current_image_n, %current_image_h, %current_image_w, %current_image_c, %cst) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // Filter.
+    %filter = call @alloc_2d_filled_f32(%current_filter_f, %current_filter_h, %current_filter_w, %current_filter_c, %cst) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // Output.
+    %output = call @alloc_2d_filled_f32(%current_output_n, %current_output_h, %current_output_w, %current_output_c, %cst_0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+
+    call @conv_2d_nhwc_fhwc(%image, %filter, %output) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
+
+    %3 = memref.cast %output : memref<?x?x?x?xf32> to memref<*xf32>
+
+    // Print output.
+    // Each output element accumulates FH * FW * C = 4 * 4 * 18 = 288 products
+    // of 0.5 * 0.5 on top of the initial 0.0, which gives 72. The output is a
+    // contiguous [2, 5, 5, 2] buffer, so the printer emits 2 * 5 * 5 = 50 rows
+    // of two 72s each.
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [2, 5, 5, 2] strides = [50, 10, 2, 1] data =
+    // CHECK-COUNT-50: [72, 72]
+    call @printMemrefF32(%3) : (memref<*xf32>) -> ()
+
+    memref.dealloc %output : memref<?x?x?x?xf32>
+    memref.dealloc %image : memref<?x?x?x?xf32>
+    memref.dealloc %filter : memref<?x?x?x?xf32>
+    return
+  }
+}
diff --git a/examples/MLIRLinalg/linalg-depthwise_conv_2d_nhwc_hwc.mlir b/examples/MLIRLinalg/linalg-depthwise_conv_2d_nhwc_hwc.mlir
@@ -0,0 +1,73 @@
+// RUN: buddy-opt %s \
+// RUN:     -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN:     -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN:     -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+  func.func private @printMemrefF32(memref<*xf32>)
+
+  // Depthwise 2-D convolution with unit strides and dilations: NHWC image
+  // (%arg0), HWC filter (%arg1), results accumulated into %arg2.
+  func.func @depthwise_conv_2d_nhwc_hwc(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
+    linalg.depthwise_conv_2d_nhwc_hwc
+      {dilations = dense<[1,1]> : tensor<2xi64>, strides = dense<[1,1]> : tensor<2xi64>}
+      ins(%arg0, %arg1 : memref<?x?x?x?xf32>, memref<?x?x?xf32>)
+      outs(%arg2 : memref<?x?x?x?xf32>)
+    return
+  }
+
+  func.func @main() {
+    // Image, filter and output are all initialised to 1.0.
+    %cf1 = arith.constant 1.0 : f32
+
+    // Image: N = 2, H = 8, W = 8, C = 18.
+    %image_n = arith.constant 2 : index
+    %image_h = arith.constant 8 : index
+    %image_w = arith.constant 8 : index
+    %image_c = arith.constant 18 : index
+
+    // Filter: H = 4, W = 4, C = 18.
+    %filter_h = arith.constant 4 : index
+    %filter_w = arith.constant 4 : index
+    %filter_c = arith.constant 18 : index
+
+    // Output: N = 2, H = W = 8 - 4 + 1 = 5, C = 18.
+    %output_n = arith.constant 2 : index
+    %output_h = arith.constant 5 : index
+    %output_w = arith.constant 5 : index
+    %output_c = arith.constant 18 : index
+
+    // Allocate image, filter, and output.
+    %image = memref.alloc(%image_n, %image_h, %image_w, %image_c) : memref<?x?x?x?xf32>
+    %filter = memref.alloc(%filter_h, %filter_w, %filter_c) : memref<?x?x?xf32>
+    %output = memref.alloc(%output_n, %output_h, %output_w, %output_c) : memref<?x?x?x?xf32>
+
+    // Fill image, filter, and output.
+    linalg.fill ins(%cf1 : f32) outs(%image : memref<?x?x?x?xf32>)
+    linalg.fill ins(%cf1 : f32) outs(%filter : memref<?x?x?xf32>)
+    linalg.fill ins(%cf1 : f32) outs(%output : memref<?x?x?x?xf32>)
+
+    // Call depthwise convolution.
+    call @depthwise_conv_2d_nhwc_hwc(%image, %filter, %output) : (memref<?x?x?x?xf32>, memref<?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
+
+    %output_cast = memref.cast %output : memref<?x?x?x?xf32> to memref<*xf32>
+
+    // Print the output.
+    // Each output element is the initial 1.0 plus FH * FW = 4 * 4 = 16 products
+    // of 1.0 * 1.0, i.e. 17. The output is a contiguous [2, 5, 5, 18] buffer,
+    // so the printer emits 2 * 5 * 5 = 50 rows of eighteen 17s each.
+    // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [2, 5, 5, 18] strides = [450, 90, 18, 1] data =
+    // CHECK-COUNT-50: [17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]
+    call @printMemrefF32(%output_cast) : (memref<*xf32>) -> ()
+
+    // Deallocate memory.
+    memref.dealloc %output : memref<?x?x?x?xf32>
+    memref.dealloc %image : memref<?x?x?x?xf32>
+    memref.dealloc %filter : memref<?x?x?xf32>
+    return
+  }
+}
diff --git a/examples/MLIRLinalg/makefile b/examples/MLIRLinalg/makefile
@@ -60,6 +60,45 @@ linalg-conv2d-tiling-run:
 		-convert-func-to-llvm -reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
+# Write the IR produced by -conv-nhwc-fhwc-optimize to ./log.mlir.
+linalg-conv2d_nhwc_fhwc-optimize-lower:
+	@${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir \
+		-conv-nhwc-fhwc-optimize="vec-size=16" -o ./log.mlir
+
+# Write the IR produced by -conv-nhwc-fhwc-tile-optimize to ./log.mlir.
+linalg-conv2d_nhwc_fhwc-tile-optimize-lower:
+	@${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir -o ./log.mlir \
+		-conv-nhwc-fhwc-tile-optimize="vec-size=16 tiling-height=2 tiling-width=3"
+
+# Apply -conv-nhwc-fhwc-optimize, lower to the LLVM dialect, and run main.
+linalg-conv2d_nhwc_fhwc-optimize-run:
+	@${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir ${MLIR_OPT_OPTIONS} \
+		-conv-nhwc-fhwc-optimize="vec-size=16" -lower-affine -convert-scf-to-cf \
+		-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+# Apply -conv-nhwc-fhwc-tile-optimize, lower to the LLVM dialect, and run main.
+linalg-conv2d_nhwc_fhwc-tile-optimize-run:
+	@${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir ${MLIR_OPT_OPTIONS} \
+		-conv-nhwc-fhwc-tile-optimize="vec-size=16 tiling-height=2 tiling-width=3" \
+		-lower-affine -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm \
+		-convert-arith-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+# Write the IR produced by -depthwise-conv-nhwc-hwc-optimize to ./log.mlir.
+linalg-depthwise_conv_2d_nhwc_hwc-optimize-lower:
+	@${BUDDY_OPT} linalg-depthwise_conv_2d_nhwc_hwc.mlir \
+		-depthwise-conv-nhwc-hwc-optimize="vec-size=16" -o ./log.mlir
+
+# Apply -depthwise-conv-nhwc-hwc-optimize, lower to the LLVM dialect, and run main.
+linalg-depthwise_conv_2d_nhwc_hwc-optimize-run:
+	@${BUDDY_OPT} linalg-depthwise_conv_2d_nhwc_hwc.mlir -depthwise-conv-nhwc-hwc-optimize="vec-size=16" \
+		-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+		-convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+		-convert-func-to-llvm -reconcile-unrealized-casts | \
+	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
 linalg-generic-lower:
 	@${MLIR_OPT} ./linalg-generic.mlir \
 		-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
@@ -177,6 +216,16 @@ linalg-batch-matmul-optimize-lower:
 		-batchmatmul-optimize="vector-size=64" \
 		-o ./log.mlir
 
+# Write the IR produced by -batchmatmul-tile-optimize to ./log.mlir.
+linalg-batch-matmul-tile-optimize-lower:
+	@${BUDDY_OPT} linalg-batch-matmul-dync.mlir ${MLIR_OPT_OPTIONS} \
+		-batchmatmul-tile-optimize="vec-size=64 kernel-m=4 kernel-n=2" -o ./log.mlir
+
+# Write the IR produced by -batchmatmul-scf-optimize to ./log.mlir.
+linalg-batch-matmul-scf-optimize-lower:
+	@${BUDDY_OPT} linalg-batch-matmul-dync.mlir ${MLIR_OPT_OPTIONS} \
+		-batchmatmul-scf-optimize="vector-size=64" -o ./log.mlir
+
 linalg-batch-matmul-optimize-translate:
 	@${BUDDY_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
 		-batchmatmul-optimize="vector-size=64" \

diff --git a/midend/lib/Conversion/CMakeLists.txt b/midend/lib/Conversion/CMakeLists.txt
@@ -14,3 +14,4 @@ add_subdirectory(LowerLinalgToGemmini)
 add_subdirectory(SchedulingOnDevices)
 add_subdirectory(LowerSche)
 add_subdirectory(FuncBufferize)
+add_subdirectory(DepthwiseConvOptimization) # NOTE(review): directory is not visible in this chunk - confirm it is added with this change
diff --git a/midend/lib/Conversion/ConvOptimization/CMakeLists.txt b/midend/lib/Conversion/ConvOptimization/CMakeLists.txt
@@ -1,3 +1,5 @@
 add_mlir_library(ConvOptimization
 	ConvOptimize.cpp
+	ConvNhwcFhwcOptimize.cpp
+	ConvNhwcFhwcOptimizeTile.cpp
   )