Skip to content

Commit

Permalink
Expand A72 and N1 models (#89)
Browse files Browse the repository at this point in the history
  • Loading branch information
hanno-becker authored Sep 29, 2024
1 parent b2241b6 commit 4f7c000
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 29 deletions.
27 changes: 18 additions & 9 deletions slothy/targets/aarch64/aarch64_neon.py
Original file line number Diff line number Diff line change
Expand Up @@ -2259,12 +2259,15 @@ class vuxtl(AArch64Instruction): # pylint: disable=missing-docstring,invalid-nam
inputs = ["Va"]
outputs = ["Vd"]

class vqrdmulh(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class Vqdmulh(AArch64Instruction):
    # Marker base class with no behaviour of its own. The sqrdmulh / sqdmulh
    # instruction classes derive from it so that the whole family can be
    # referred to by this single name (e.g. as one key in a
    # microarchitecture model's table) instead of listing every variant.
    pass

class vqrdmulh(Vqdmulh): # pylint: disable=missing-docstring,invalid-name
    # Vector-by-vector form of `sqrdmulh` (no lane index on the last operand).
    # Va and Vb are declared as inputs; Vd is declared as a pure output.
    pattern = "sqrdmulh <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
    inputs = ["Va", "Vb"]
    outputs = ["Vd"]

class vqrdmulh_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class vqrdmulh_lane(Vqdmulh): # pylint: disable=missing-docstring,invalid-name
pattern = "sqrdmulh <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
inputs = ["Va", "Vb"]
outputs = ["Vd"]
Expand All @@ -2276,7 +2279,7 @@ def make(cls, src):
[ f"v{i}" for i in range(0,16) ]]
return obj

class vqdmulh_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class vqdmulh_lane(Vqdmulh): # pylint: disable=missing-docstring,invalid-name
pattern = "sqdmulh <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
inputs = ["Va", "Vb"]
outputs = ["Vd"]
Expand Down Expand Up @@ -2380,12 +2383,15 @@ class vext(AArch64NeonLogical): # pylint: disable=missing-docstring,invalid-name
inputs = ["Va", "Vb"]
outputs = ["Vd"]

class vmul(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class Vmul(AArch64Instruction):
    # Marker base class with no behaviour of its own. The vector `mul`
    # instruction classes (plain and by-lane) derive from it so that they can
    # be referred to collectively by this single name.
    pass

class vmul(Vmul): # pylint: disable=missing-docstring,invalid-name
    # Vector-by-vector form of `mul` (no lane index on the last operand).
    # Va and Vb are declared as inputs; Vd is declared as a pure output.
    pattern = "mul <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
    inputs = ["Va", "Vb"]
    outputs = ["Vd"]

class vmul_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class vmul_lane(Vmul): # pylint: disable=missing-docstring,invalid-name
pattern = "mul <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
inputs = ["Va", "Vb"]
outputs = ["Vd"]
Expand All @@ -2398,12 +2404,15 @@ def make(cls, src):

return obj

class vmla(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class Vmla(AArch64Instruction):
    # Marker base class with no behaviour of its own. The vector `mla` and
    # `mls` instruction classes (plain and by-lane) derive from it so that
    # they can be referred to collectively by this single name.
    pass

class vmla(Vmla): # pylint: disable=missing-docstring,invalid-name
    # Vector-by-vector form of `mla` (no lane index on the last operand).
    # Va and Vb are declared as inputs; Vd is declared as an in/out operand
    # rather than a pure output.
    pattern = "mla <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
    inputs = ["Va", "Vb"]
    in_outs=["Vd"]

class vmla_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class vmla_lane(Vmla): # pylint: disable=missing-docstring,invalid-name
pattern = "mla <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
inputs = ["Va", "Vb"]
in_outs=["Vd"]
Expand All @@ -2415,12 +2424,12 @@ def make(cls, src):
[ f"v{i}" for i in range(0,16) ]]
return obj

class vmls(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class vmls(Vmla): # pylint: disable=missing-docstring,invalid-name
    # Vector-by-vector form of `mls` (no lane index on the last operand).
    # Derives from the same Vmla marker class as the `mla` variants.
    # Va and Vb are declared as inputs; Vd is declared as an in/out operand
    # rather than a pure output.
    pattern = "mls <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
    inputs = ["Va", "Vb"]
    in_outs = ["Vd"]

class vmls_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
class vmls_lane(Vmla): # pylint: disable=missing-docstring,invalid-name
pattern = "mls <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
inputs = ["Va", "Vb"]
in_outs=["Vd"]
Expand Down
4 changes: 3 additions & 1 deletion slothy/targets/aarch64/cortex_a72_frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def get_min_max_objective(slothy):

vsrshr : [ExecutionUnit.ASIMD1],

St4 : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
(St4, St2) : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],

Ld4 : [[ExecutionUnit.ASIMD0, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1],
[ExecutionUnit.ASIMD1, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1]]
Expand Down Expand Up @@ -166,6 +166,7 @@ def get_min_max_objective(slothy):

vsrshr : 1,

St2 : 4,
St4 : 8,
Ld4 : 4
}
Expand Down Expand Up @@ -195,6 +196,7 @@ def get_min_max_objective(slothy):
(add, add_imm, add_lsl, add_lsr) : 2,

vsrshr : 3, # approx
St2 : 4,
St4 : 8,
Ld4 : 4
}
Expand Down
63 changes: 44 additions & 19 deletions slothy/targets/aarch64/neoverse_n1_experimental.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,20 +96,21 @@ def get_min_max_objective(slothy):
# [ExecutionUnit.VEC0, ExecutionUnit.LSU1],
# [ExecutionUnit.VEC1, ExecutionUnit.LSU0],
# [ExecutionUnit.VEC1, ExecutionUnit.LSU1]],
(vuzp1, vuzp2, vzip1,
Vrev, uaddlp) : ExecutionUnit.V(),
# TODO: As above, this should somehow occupy both V and L
St4 : ExecutionUnit.V(),
(Vzip, Vrev, uaddlp) : ExecutionUnit.V(),
(vmov) : ExecutionUnit.V(),
VecToGprMov : ExecutionUnit.V(),
Transpose : ExecutionUnit.V(),
(vmovi) : ExecutionUnit.V(),
(vand, vadd) : ExecutionUnit.V(),
(vand, vadd, vsub) : ExecutionUnit.V(),
(vxtn) : ExecutionUnit.V(),
(vuxtl, vshl, vshl_d,
vshli, vshrn) : ExecutionUnit.V1(),
vshli, vsrshr, vshrn) : ExecutionUnit.V1(),
vusra : ExecutionUnit.V1(),
AESInstruction : ExecutionUnit.V0(),
(vmul, Vmlal, vmull,
vmull2) : ExecutionUnit.V0(),
(Vmul, Vmla, Vqdmulh,
Vmull, Vmlal) : ExecutionUnit.V0(),
AArch64NeonLogical : ExecutionUnit.V(),
(AArch64BasicArithmetic,
AArch64ConditionalSelect,
Expand All @@ -131,21 +132,21 @@ def get_min_max_objective(slothy):
(Ldr_X, Str_X,
Ldr_Q, Str_Q) : 1,
(Ldp_X, Stp_X) : 2,
(vuzp1, vuzp2, vzip1,
uaddlp, Vrev) : 1,
St4 : 6, # TODO: Really??
(Vzip, uaddlp, Vrev) : 1,
VecToGprMov : 1,
(vand, vadd) : 1,
(vand, vadd, vsub) : 1,
(vmov) : 1,
Transpose : 1,
AESInstruction : 1,
AArch64NeonLogical : 1,
(vmovi) : 1,
(vxtn) : 1,
(vuxtl, vshl, vshl_d,
vshli, vshrn) : 1,
(vmul) : 2,
vshli, vsrshr, vshrn) : 1,
(Vmul, Vmla, Vqdmulh) : 2,
vusra : 1,
(Vmlal, vmull, vmull2) : 1,
(Vmull, Vmlal) : 1,
(AArch64BasicArithmetic,
AArch64ConditionalSelect,
AArch64ConditionalCompare,
Expand All @@ -167,21 +168,22 @@ def get_min_max_objective(slothy):
Ldr_X,
Ldr_Q) : 4,
(Stp_X, Str_X, Str_Q) : 2,
(vuzp1, vuzp2, vzip1,
Vrev, uaddlp) : 2,
St4 : 4,
(Vzip, Vrev, uaddlp) : 2,
VecToGprMov : 2,
(vxtn) : 2,
AESInstruction : 2,
AArch64NeonLogical : 2,
Transpose : 2,
(vand, vadd) : 2,
(vand, vadd, vsub) : 2,
(vmov) : 2, # ???
(vmovi) : 2,
(vmul) : 5,
(Vmul, Vmla, Vqdmulh) : 5,
vusra : 4, # TODO: Add fwd path
(Vmlal, vmull, vmull2) : 4, # TODO: Add fwd path
(Vmull, Vmlal) : 4,
(vuxtl, vshl, vshl_d,
vshli, vshrn) : 2,
(vsrshr) : 4,
(AArch64BasicArithmetic,
AArch64ConditionalSelect,
AArch64ConditionalCompare,
Expand All @@ -201,9 +203,32 @@ def get_min_max_objective(slothy):
def get_latency(src, out_idx, dst):
    """Return the latency from instruction `src` to its consumer `dst`.

    The result is the per-class entry for `src` in `default_latencies`,
    unless `src` and `dst` form one of the multiply / multiply-accumulate
    forwarding pairs handled below. A forwarding pair only applies when the
    register produced by `src` is the in/out (accumulator) operand of `dst`.

    Args:
        src: The producing instruction.
        out_idx: Index of the output of `src` under consideration; unused.
        dst: The consuming instruction.

    Returns:
        2 for mul->mla/mls and mla/mls->mla/mls forwarding, 1 for
        mull->mlal and mlal->mlal forwarding, otherwise the default latency
        looked up for `src`.
    """
    _ = out_idx  # out_idx unused

    instclass_src = find_class(src)
    instclass_dst = find_class(dst)

    # Looked up before the forwarding checks so that an instruction missing
    # from the table is reported regardless of which path would be taken.
    latency = lookup_multidict(default_latencies, src)

    mul_classes = (vmul, vmul_lane)
    mla_classes = (vmla, vmla_lane, vmls, vmls_lane)

    # Fast mul->mla forwarding
    if (instclass_src in mul_classes
            and instclass_dst in mla_classes
            and src.args_out[0] == dst.args_in_out[0]):
        return 2

    # Fast mla->mla forwarding
    if (instclass_src in mla_classes
            and instclass_dst in mla_classes
            and src.args_in_out[0] == dst.args_in_out[0]):
        return 2

    # Fast mull->mlal forwarding
    if (instclass_src in all_subclass_leaves(Vmull)
            and instclass_dst in all_subclass_leaves(Vmlal)
            and src.args_out[0] == dst.args_in_out[0]):
        return 1

    # Fast mlal->mlal forwarding
    if (instclass_src in all_subclass_leaves(Vmlal)
            and instclass_dst in all_subclass_leaves(Vmlal)
            and src.args_in_out[0] == dst.args_in_out[0]):
        return 1

    return latency

def get_units(src):
Expand Down

0 comments on commit 4f7c000

Please sign in to comment.