Skip to content

Commit

Permalink
Interleave sync instructions in matmul designs (#1678)
Browse files Browse the repository at this point in the history
  • Loading branch information
andrej committed Aug 12, 2024
1 parent ca6c1ea commit f2c88ab
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from aie.dialects.aiex import *
from aie.dialects.scf import *
import aie.utils.trace as trace_utils
from aie.utils.trace import PortEvent


def main():
Expand Down Expand Up @@ -245,44 +246,74 @@ def sequence(A, B, C):
ddr_id=2,
size=trace_size,
offset=C_sz_in_bytes,
events=[
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_0,
port_number=1,
master=True,
),
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_1,
port_number=2,
master=True,
),
PortEvent(
trace_utils.CoreEvent.PORT_RUNNING_2,
port_number=5,
master=True,
),
trace_utils.CoreEvent.INSTR_EVENT_0,
trace_utils.CoreEvent.INSTR_EVENT_1,
trace_utils.CoreEvent.MEMORY_STALL,
trace_utils.CoreEvent.LOCK_STALL,
trace_utils.CoreEvent.INSTR_VECTOR,
],
)

# only do 5 tile rows at a time before synchronizing, so we can reuse BDs
rows_per_block = 5
# only do 6 tile rows at a time (3 per ping-pong half) before synchronizing, so we can reuse BDs
rows_per_block = 6
for tile_row_block in range(ceildiv(M_div_m, rows_per_block)):
C_row_offset = tile_row_block * rows_per_block * m * N
num_tile_rows = min(
[rows_per_block, M_div_m - tile_row_block * rows_per_block]
)
npu_dma_memcpy_nd(
metadata="outC",
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_row_offset],
sizes=[num_tile_rows, N_div_n, m, n],
strides=[m_x_N, n, N, 1],
)
for tile_row in range(num_tile_rows):
A_row_offset = (
((tile_row_block * rows_per_block) + tile_row) * m * K
# We sync on only half of the BDs before reusing them, so the other half can keep running concurrently.
# This ping-pong loop alternates between those two halves of the BDs.
for pingpong in [0, 1]:
C_row_offset = (
tile_row_block * rows_per_block * m * N
+ pingpong * rows_per_block // 2 * m * N
)
npu_dma_memcpy_nd(
metadata="inA",
bd_id=2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_row_offset],
sizes=[N_div_n, K_div_k, m, k],
strides=[0, k, K, 1],
row_base = (
tile_row_block * rows_per_block
+ pingpong * rows_per_block // 2
)
bd_id_base = 8 * pingpong
num_tile_rows = min([rows_per_block // 2, M_div_m - row_base])
npu_dma_memcpy_nd(
metadata="inB",
bd_id=2 * tile_row + 2,
mem=B,
sizes=[N_div_n, K_div_k, k, n],
strides=[n, k_x_N, N, 1],
metadata="outC",
bd_id=bd_id_base,
mem=C,
offsets=[0, 0, 0, C_row_offset],
sizes=[num_tile_rows, N_div_n, m, n],
strides=[m_x_N, n, N, 1],
)

npu_sync(column=0, row=0, direction=0, channel=0)
for tile_row in range(num_tile_rows):
A_row_offset = (row_base + tile_row) * m * K
npu_dma_memcpy_nd(
metadata="inA",
bd_id=bd_id_base + 2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_row_offset],
sizes=[N_div_n, K_div_k, m, k],
strides=[0, k, K, 1],
)
npu_dma_memcpy_nd(
metadata="inB",
bd_id=bd_id_base + 2 * tile_row + 2,
mem=B,
sizes=[N_div_n, K_div_k, k, n],
strides=[n, k_x_N, N, 1],
)
if tile_row_block > 0 or (tile_row_block == 0 and pingpong > 0):
npu_sync(column=0, row=0, direction=0, channel=0)
npu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ Each of `inA_fifos`, `inB_fifos`, `OutC_fifos`, `memA_fifos`, `memB_fifos` and `

Of note is the `object_fifo_link()` operation. This operation establishes a connection between the `mem*` FIFOs and the `in*` and `outC` FIFOs. By linking ObjectFIFOs, the output received at one end of the source FIFO is fed as input into the ObjectFIFO listed as the destination.

[![data movement diagram](diagram.png)](https://excalidraw.com/#room=23df780b85d72d80cbc6,1czLdPr_vK9-OjtxFIWTpw)

<!-- 2. Creation of Object Fifos for Matrix A:
* The input matrix A is streamed from the host to the AIE array using object fifos. `inA_fifos` and `memA_fifos` are dictionaries created to store the object fifos for input matrix A. `inA_fifo_names` and `memA_fifo_names` are lists storing the names of corresponding object fifos.
Expand Down
161 changes: 117 additions & 44 deletions programming_examples/basic/matrix_multiplication/whole_array/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,56 +301,129 @@ def core_body():
)
def sequence(A, B, C):
# We are limited in the number of BDs. After synchronizing, we can reuse BDs.
# We only transfer 5 rows of tiles at once before starting a new transfer block.
# We only transfer 4 rows of tiles at once (2 per ping-pong half) before starting a new transfer block.
tb_max_n_rows = (
5 # tb = transfer block; block of transfers before sync call
4 # tb = transfer block; block of transfers before sync call
)
for tb in range(ceildiv(M // m // n_aie_rows, tb_max_n_rows)):
tb_n_rows = min(
[tb_max_n_rows, M // m // n_aie_rows - tb * tb_max_n_rows]
)
C_row_offset = tb * tb_max_n_rows * m * n_aie_rows * N
for col in range(n_aie_cols):
C_col_offset = col * n
C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=C_l2l3_fifos[col].sym_name.value,
bd_id=0,
mem=C,
offsets=[0, 0, 0, C_offset],
sizes=[tb_n_rows, N // n // n_aie_cols, m * n_aie_rows, n],
strides=[m * n_aie_rows * N, n * n_aie_cols, N, 1],
for pingpong in [0, 1]:
M // m // n_aie_rows // tb_max_n_rows
row_base = tb * tb_max_n_rows + pingpong * tb_max_n_rows // 2
bd_id_base = 8 * pingpong
tb_n_rows = min(
[tb_max_n_rows // 2, M // m // n_aie_rows - row_base]
)
for tile_row in range(tb_n_rows):
A_block_offset = (
((tb * tb_max_n_rows) + tile_row) * n_aie_rows * m * K
)
A_row_offset = col * n_A_tiles_per_shim * m * K
A_offset = A_block_offset + A_row_offset
B_col_offset = col * n
if tb_n_rows <= 0:
# for small input sizes, we may not even need a "pong" iteration
break
for col in range(n_aie_cols):

# C Output Transfer:
# The smallest transfer unit is a (m*n_aie_rows)-x-(n)-sized sub-tile of the matrix.
# Transfer one such tile for every (n_aie_cols)-th column, evenly spaced,
# then repeat that (tb_n_rows) times for the next contiguous blocks of rows.
# Each shim will start at a different column offset, transferring interleaved
# columns. For example, shim 0 may transfer the blocks marked 0 below, and shim 1
# may transfer the blocks marked 1.
#
# N
# ----------------
# |0011 0011 |
# |0011 0011 |
# |0011 0011 |
# M |0011 0011 |
# | |
# | |
# | |
# | |
# ----------------
C_row_offset = row_base * m * n_aie_rows * N
C_col_offset = col * n
C_offset = C_col_offset + C_row_offset
npu_dma_memcpy_nd(
metadata=A_l3l2_fifos[col].sym_name.value,
bd_id=2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_offset],
sizes=[
N // n // n_aie_cols,
K // k,
m * n_A_tiles_per_shim,
k,
],
strides=[0, k, K, 1],
metadata=C_l2l3_fifos[col].sym_name.value,
bd_id=bd_id_base,
mem=C,
offsets=[0, 0, 0, C_offset],
sizes=[tb_n_rows, N // n // n_aie_cols, m * n_aie_rows, n],
strides=[m * n_aie_rows * N, n * n_aie_cols, N, 1],
)
npu_dma_memcpy_nd(
metadata=B_l3l2_fifos[col].sym_name.value,
bd_id=2 * tile_row + 2,
mem=B,
offsets=[0, 0, 0, B_col_offset],
sizes=[N // n // n_aie_cols, K // k, k, n],
strides=[n * n_aie_cols, k * N, N, 1],
)
for col in range(n_aie_cols):
npu_sync(column=col, row=0, direction=0, channel=0)

for tile_row in range(tb_n_rows):

# A input transfer:
#
# The smallest transfer unit is a (m*n_A_tiles_per_shim)-x-(k)-sized sub-tile of the input matrix.
# Transfer one such tile for every column, contiguously.
# Repeat this transfer with identical tiles a total of (N//n//n_aie_cols) times.
# Each shim transfers the tiles for separate rows. For example, shim 0 may transfer the
# tiles marked 0 below, and shim 1 may transfer the tiles marked 1.
# K
# ----------------
# |0000000000000000| (repeated N//n//n_aie_cols times)
# |0000000000000000|
# |1111111111111111|
# M |1111111111111111|
# | |
# | |
# | |
# | |
# ----------------
A_block_offset = (
(row_base + tile_row) * n_aie_rows * m * K
) # base address for this transfer block for all BDs
A_row_offset = (
col * n_A_tiles_per_shim * m * K
) # base address for the shim in this column
A_offset = A_block_offset + A_row_offset
npu_dma_memcpy_nd(
metadata=A_l3l2_fifos[col].sym_name.value,
bd_id=bd_id_base + 2 * tile_row + 1,
mem=A,
offsets=[0, 0, 0, A_offset],
sizes=[
N // n // n_aie_cols,
K // k,
m * n_A_tiles_per_shim,
k,
],
strides=[0, k, K, 1],
)

# B input transfer:
# Transfer the first (n)-wide block of columns of B,
# then transfer the (n_aie_cols)-th such block, and so on.
# Each shim will start at a different column offset.
# For example, shim 0 may transfer the tiles marked 0 below,
# and shim 1 may transfer the tiles marked 1.
#
# N
# ----------------
# |0011 0011 |
# |0011 0011 |
# |0011 0011 |
# K |0011 0011 |
# |0011 0011 |
# |0011 0011 |
# |0011 0011 |
# |0011 0011 |
# ----------------
B_col_offset = col * n
npu_dma_memcpy_nd(
metadata=B_l3l2_fifos[col].sym_name.value,
bd_id=bd_id_base + 2 * tile_row + 2,
mem=B,
offsets=[0, 0, 0, B_col_offset],
sizes=[N // n // n_aie_cols, K // k, k, n],
strides=[n * n_aie_cols, k * N, N, 1],
)
if tb > 0 or (tb == 0 and pingpong > 0):
for col in range(n_aie_cols):
npu_sync(
column=col, row=0, direction=0, channel=0
) # C done
for col in range(n_aie_cols):
npu_sync(column=col, row=0, direction=0, channel=0)


if __name__ == "__main__":
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit f2c88ab

Please sign in to comment.