diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py index 79bba7b1..90aa8dbc 100644 --- a/slothy/targets/aarch64/aarch64_neon.py +++ b/slothy/targets/aarch64/aarch64_neon.py @@ -2259,12 +2259,15 @@ class vuxtl(AArch64Instruction): # pylint: disable=missing-docstring,invalid-nam inputs = ["Va"] outputs = ["Vd"] -class vqrdmulh(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class Vqdmulh(AArch64Instruction): + pass + +class vqrdmulh(Vqdmulh): # pylint: disable=missing-docstring,invalid-name pattern = "sqrdmulh ., ., ." inputs = ["Va", "Vb"] outputs = ["Vd"] -class vqrdmulh_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class vqrdmulh_lane(Vqdmulh): # pylint: disable=missing-docstring,invalid-name pattern = "sqrdmulh ., ., .[]" inputs = ["Va", "Vb"] outputs = ["Vd"] @@ -2276,7 +2279,7 @@ def make(cls, src): [ f"v{i}" for i in range(0,16) ]] return obj -class vqdmulh_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class vqdmulh_lane(Vqdmulh): # pylint: disable=missing-docstring,invalid-name pattern = "sqdmulh ., ., .[]" inputs = ["Va", "Vb"] outputs = ["Vd"] @@ -2380,12 +2383,15 @@ class vext(AArch64NeonLogical): # pylint: disable=missing-docstring,invalid-name inputs = ["Va", "Vb"] outputs = ["Vd"] -class vmul(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class Vmul(AArch64Instruction): + pass + +class vmul(Vmul): # pylint: disable=missing-docstring,invalid-name pattern = "mul ., ., ." inputs = ["Va", "Vb"] outputs = ["Vd"] -class vmul_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class vmul_lane(Vmul): # pylint: disable=missing-docstring,invalid-name pattern = "mul ., ., .[]" inputs = ["Va", "Vb"] outputs = ["Vd"] @@ -2398,12 +2404,15 @@ def make(cls, src): return obj -class vmla(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class Vmla(AArch64Instruction): + pass + +class vmla(Vmla): # pylint: disable=missing-docstring,invalid-name pattern = "mla ., ., ." inputs = ["Va", "Vb"] in_outs=["Vd"] -class vmla_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class vmla_lane(Vmla): # pylint: disable=missing-docstring,invalid-name pattern = "mla ., ., .[]" inputs = ["Va", "Vb"] in_outs=["Vd"] @@ -2415,12 +2424,12 @@ def make(cls, src): [ f"v{i}" for i in range(0,16) ]] return obj -class vmls(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class vmls(Vmla): # pylint: disable=missing-docstring,invalid-name pattern = "mls ., ., ." inputs = ["Va", "Vb"] in_outs = ["Vd"] -class vmls_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name +class vmls_lane(Vmla): # pylint: disable=missing-docstring,invalid-name pattern = "mls ., ., .[]" inputs = ["Va", "Vb"] in_outs=["Vd"] diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py index aac5ddad..1a74a7dc 100644 --- a/slothy/targets/aarch64/cortex_a72_frontend.py +++ b/slothy/targets/aarch64/cortex_a72_frontend.py @@ -133,7 +133,7 @@ def get_min_max_objective(slothy): vsrshr : [ExecutionUnit.ASIMD1], - St4 : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], + (St4, St2) : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1], Ld4 : [[ExecutionUnit.ASIMD0, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1], [ExecutionUnit.ASIMD1, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1]] @@ -166,6 +166,7 @@ def get_min_max_objective(slothy): vsrshr : 1, + St2 : 4, St4 : 8, Ld4 : 4 } @@ -195,6 +196,7 @@ def get_min_max_objective(slothy): (add, add_imm, add_lsl, add_lsr) : 2, vsrshr : 3, # approx + St2 : 4, St4 : 8, Ld4 : 4 } diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py index ef7c62a6..a28624e5 100644 --- a/slothy/targets/aarch64/neoverse_n1_experimental.py +++ b/slothy/targets/aarch64/neoverse_n1_experimental.py @@ -96,20 +96,21 @@ def get_min_max_objective(slothy): # [ExecutionUnit.VEC0, ExecutionUnit.LSU1], # [ExecutionUnit.VEC1, ExecutionUnit.LSU0], # [ExecutionUnit.VEC1, ExecutionUnit.LSU1]], - (vuzp1, vuzp2, vzip1, - Vrev, uaddlp) : ExecutionUnit.V(), + # TODO: As above, this should somehow occupy both V and L + St4 : ExecutionUnit.V(), + (Vzip, Vrev, uaddlp) : ExecutionUnit.V(), (vmov) : ExecutionUnit.V(), VecToGprMov : ExecutionUnit.V(), Transpose : ExecutionUnit.V(), (vmovi) : ExecutionUnit.V(), - (vand, vadd) : ExecutionUnit.V(), + (vand, vadd, vsub) : ExecutionUnit.V(), (vxtn) : ExecutionUnit.V(), (vuxtl, vshl, vshl_d, - vshli, vshrn) : ExecutionUnit.V1(), + vshli, vsrshr, vshrn) : ExecutionUnit.V1(), vusra : ExecutionUnit.V1(), AESInstruction : ExecutionUnit.V0(), - (vmul, Vmlal, vmull, - vmull2) : ExecutionUnit.V0(), + (Vmul, Vmla, Vqdmulh, + Vmull, Vmlal) : ExecutionUnit.V0(), AArch64NeonLogical : ExecutionUnit.V(), (AArch64BasicArithmetic, AArch64ConditionalSelect, @@ -131,10 +132,10 @@ def get_min_max_objective(slothy): (Ldr_X, Str_X, Ldr_Q, Str_Q) : 1, (Ldp_X, Stp_X) : 2, - (vuzp1, vuzp2, vzip1, - uaddlp, Vrev) : 1, + St4 : 6, # TODO: Really?? + (Vzip, uaddlp, Vrev) : 1, VecToGprMov : 1, - (vand, vadd) : 1, + (vand, vadd, vsub) : 1, (vmov) : 1, Transpose : 1, AESInstruction : 1, @@ -142,10 +143,10 @@ def get_min_max_objective(slothy): (vmovi) : 1, (vxtn) : 1, (vuxtl, vshl, vshl_d, - vshli, vshrn) : 1, - (vmul) : 2, + vshli, vsrshr, vshrn) : 1, + (Vmul, Vmla, Vqdmulh) : 2, vusra : 1, - (Vmlal, vmull, vmull2) : 1, + (Vmull, Vmlal) : 1, (AArch64BasicArithmetic, AArch64ConditionalSelect, AArch64ConditionalCompare, @@ -167,21 +168,22 @@ def get_min_max_objective(slothy): Ldr_X, Ldr_Q) : 4, (Stp_X, Str_X, Str_Q) : 2, - (vuzp1, vuzp2, vzip1, - Vrev, uaddlp) : 2, + St4 : 4, + (Vzip, Vrev, uaddlp) : 2, VecToGprMov : 2, (vxtn) : 2, AESInstruction : 2, AArch64NeonLogical : 2, Transpose : 2, - (vand, vadd) : 2, + (vand, vadd, vsub) : 2, (vmov) : 2, # ??? (vmovi) : 2, - (vmul) : 5, + (Vmul, Vmla, Vqdmulh) : 5, vusra : 4, # TODO: Add fwd path - (Vmlal, vmull, vmull2) : 4, # TODO: Add fwd path + (Vmull, Vmlal) : 4, (vuxtl, vshl, vshl_d, vshli, vshrn) : 2, + (vsrshr) : 4, (AArch64BasicArithmetic, AArch64ConditionalSelect, AArch64ConditionalCompare, @@ -201,9 +203,32 @@ def get_min_max_objective(slothy): def get_latency(src, out_idx, dst): _ = out_idx # out_idx unused - _ = find_class(src) - _ = find_class(dst) + instclass_src = find_class(src) + instclass_dst = find_class(dst) + latency = lookup_multidict(default_latencies, src) + + # Fast mul->mla forwarding + if instclass_src in [vmul, vmul_lane] and \ + instclass_dst in [vmla, vmla_lane, vmls, vmls_lane] and \ + src.args_out[0] == dst.args_in_out[0]: + return 2 + # Fast mla->mla forwarding + if instclass_src in [vmla, vmla_lane, vmls, vmls_lane] and \ + instclass_dst in [vmla, vmla_lane, vmls, vmls_lane] and \ + src.args_in_out[0] == dst.args_in_out[0]: + return 2 + # Fast mull->mlal forwarding + if instclass_src in all_subclass_leaves(Vmull) and \ + instclass_dst in all_subclass_leaves(Vmlal) and \ + src.args_out[0] == dst.args_in_out[0]: + return 1 + # Fast mlal->mlal forwarding + if instclass_src in all_subclass_leaves(Vmlal) and \ + instclass_dst in all_subclass_leaves(Vmlal) and \ + src.args_in_out[0] == dst.args_in_out[0]: + return 1 + return latency def get_units(src):