diff --git a/slothy/targets/aarch64/aarch64_neon.py b/slothy/targets/aarch64/aarch64_neon.py
index 79bba7b1..90aa8dbc 100644
--- a/slothy/targets/aarch64/aarch64_neon.py
+++ b/slothy/targets/aarch64/aarch64_neon.py
@@ -2259,12 +2259,15 @@ class vuxtl(AArch64Instruction): # pylint: disable=missing-docstring,invalid-nam
     inputs = ["Va"]
     outputs = ["Vd"]
 
-class vqrdmulh(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class Vqdmulh(AArch64Instruction): # Common base class of vqrdmulh, vqrdmulh_lane and vqdmulh_lane
+    pass
+
+class vqrdmulh(Vqdmulh): # pylint: disable=missing-docstring,invalid-name
     pattern = "sqrdmulh <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
 
-class vqrdmulh_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class vqrdmulh_lane(Vqdmulh): # pylint: disable=missing-docstring,invalid-name
     pattern = "sqrdmulh <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
@@ -2276,7 +2279,7 @@ def make(cls, src):
                                           [ f"v{i}" for i in range(0,16) ]]
         return obj
 
-class vqdmulh_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class vqdmulh_lane(Vqdmulh): # pylint: disable=missing-docstring,invalid-name
     pattern = "sqdmulh <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
@@ -2380,12 +2383,15 @@ class vext(AArch64NeonLogical): # pylint: disable=missing-docstring,invalid-name
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
 
-class vmul(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class Vmul(AArch64Instruction): # Common base class of vmul and vmul_lane
+    pass
+
+class vmul(Vmul): # pylint: disable=missing-docstring,invalid-name
     pattern = "mul <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
 
-class vmul_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class vmul_lane(Vmul): # pylint: disable=missing-docstring,invalid-name
     pattern = "mul <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
     inputs = ["Va", "Vb"]
     outputs = ["Vd"]
@@ -2398,12 +2404,15 @@ def make(cls, src):
 
         return obj
 
-class vmla(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class Vmla(AArch64Instruction): # Common base class of vmla, vmla_lane, vmls and vmls_lane
+    pass
+
+class vmla(Vmla): # pylint: disable=missing-docstring,invalid-name
     pattern = "mla <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
     inputs = ["Va", "Vb"]
     in_outs=["Vd"]
 
-class vmla_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class vmla_lane(Vmla): # pylint: disable=missing-docstring,invalid-name
     pattern = "mla <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
     inputs = ["Va", "Vb"]
     in_outs=["Vd"]
@@ -2415,12 +2424,12 @@ def make(cls, src):
                                           [ f"v{i}" for i in range(0,16) ]]
         return obj
 
-class vmls(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class vmls(Vmla): # pylint: disable=missing-docstring,invalid-name
     pattern = "mls <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>"
     inputs = ["Va", "Vb"]
     in_outs = ["Vd"]
 
-class vmls_lane(AArch64Instruction): # pylint: disable=missing-docstring,invalid-name
+class vmls_lane(Vmla): # pylint: disable=missing-docstring,invalid-name
     pattern = "mls <Vd>.<dt0>, <Va>.<dt1>, <Vb>.<dt2>[<index>]"
     inputs = ["Va", "Vb"]
     in_outs=["Vd"]
diff --git a/slothy/targets/aarch64/cortex_a72_frontend.py b/slothy/targets/aarch64/cortex_a72_frontend.py
index aac5ddad..1a74a7dc 100644
--- a/slothy/targets/aarch64/cortex_a72_frontend.py
+++ b/slothy/targets/aarch64/cortex_a72_frontend.py
@@ -133,7 +133,7 @@ def get_min_max_objective(slothy):
 
     vsrshr : [ExecutionUnit.ASIMD1],
 
-    St4 : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
+    (St4, St2) : [ExecutionUnit.ASIMD0, ExecutionUnit.ASIMD1],
 
     Ld4 : [[ExecutionUnit.ASIMD0, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1],
            [ExecutionUnit.ASIMD1, ExecutionUnit.LOAD0, ExecutionUnit.LOAD1]]
@@ -166,6 +166,7 @@ def get_min_max_objective(slothy):
 
     vsrshr : 1,
 
+    St2 : 4,
     St4 : 8,
     Ld4 : 4
 }
@@ -195,6 +196,7 @@ def get_min_max_objective(slothy):
     (add, add_imm, add_lsl, add_lsr) : 2,
 
     vsrshr : 3, # approx
+    St2 : 4,
     St4 : 8,
     Ld4 : 4
 }
diff --git a/slothy/targets/aarch64/neoverse_n1_experimental.py b/slothy/targets/aarch64/neoverse_n1_experimental.py
index ef7c62a6..a28624e5 100644
--- a/slothy/targets/aarch64/neoverse_n1_experimental.py
+++ b/slothy/targets/aarch64/neoverse_n1_experimental.py
@@ -96,20 +96,21 @@ def get_min_max_objective(slothy):
     #          [ExecutionUnit.VEC0, ExecutionUnit.LSU1],
     #          [ExecutionUnit.VEC1, ExecutionUnit.LSU0],
     #          [ExecutionUnit.VEC1, ExecutionUnit.LSU1]],
-    (vuzp1, vuzp2, vzip1,
-     Vrev, uaddlp)           : ExecutionUnit.V(),
+    # TODO: As above, this should somehow occupy both V and L
+    St4                       : ExecutionUnit.V(),
+    (Vzip, Vrev, uaddlp)      : ExecutionUnit.V(),
     (vmov)                    : ExecutionUnit.V(),
     VecToGprMov               : ExecutionUnit.V(),
     Transpose                 : ExecutionUnit.V(),
     (vmovi)                   : ExecutionUnit.V(),
-    (vand, vadd)              : ExecutionUnit.V(),
+    (vand, vadd, vsub)        : ExecutionUnit.V(),
     (vxtn)                    : ExecutionUnit.V(),
     (vuxtl, vshl, vshl_d,
-     vshli, vshrn)            : ExecutionUnit.V1(),
+     vshli, vsrshr, vshrn)    : ExecutionUnit.V1(),
     vusra                     : ExecutionUnit.V1(),
     AESInstruction            : ExecutionUnit.V0(),
-    (vmul, Vmlal, vmull,
-     vmull2)                  : ExecutionUnit.V0(),
+    (Vmul, Vmla, Vqdmulh,
+     Vmull, Vmlal)            : ExecutionUnit.V0(),
     AArch64NeonLogical        : ExecutionUnit.V(),
     (AArch64BasicArithmetic,
      AArch64ConditionalSelect,
@@ -131,10 +132,10 @@ def get_min_max_objective(slothy):
     (Ldr_X, Str_X,
      Ldr_Q, Str_Q)             : 1,
     (Ldp_X, Stp_X)             : 2,
-    (vuzp1, vuzp2, vzip1,
-     uaddlp, Vrev)            : 1,
+    St4                        : 6, # TODO: Unverified value for St4 -- confirm
+    (Vzip, uaddlp, Vrev)       : 1,
     VecToGprMov                : 1,
-    (vand, vadd)               : 1,
+    (vand, vadd, vsub)         : 1,
     (vmov)                     : 1,
     Transpose                  : 1,
     AESInstruction             : 1,
@@ -142,10 +143,10 @@ def get_min_max_objective(slothy):
     (vmovi)                    : 1,
     (vxtn)                     : 1,
     (vuxtl, vshl, vshl_d,
-     vshli, vshrn)             : 1,
-    (vmul)                     : 2,
+     vshli, vsrshr, vshrn)     : 1,
+    (Vmul, Vmla, Vqdmulh)      : 2,
     vusra                      : 1,
-    (Vmlal, vmull, vmull2)     : 1,
+    (Vmull, Vmlal)             : 1,
     (AArch64BasicArithmetic,
      AArch64ConditionalSelect,
      AArch64ConditionalCompare,
@@ -167,21 +168,22 @@ def get_min_max_objective(slothy):
      Ldr_X,
      Ldr_Q)                   : 4,
     (Stp_X, Str_X, Str_Q)     : 2,
-    (vuzp1, vuzp2, vzip1,
-     Vrev, uaddlp)           : 2,
+    St4                       : 4,
+    (Vzip, Vrev, uaddlp)      : 2,
     VecToGprMov               : 2,
     (vxtn)                    : 2,
     AESInstruction            : 2,
     AArch64NeonLogical        : 2,
     Transpose                 : 2,
-    (vand, vadd)              : 2,
+    (vand, vadd, vsub)        : 2,
     (vmov)                    : 2, # ???
     (vmovi)                   : 2,
-    (vmul)                    : 5,
+    (Vmul, Vmla, Vqdmulh)     : 5,
     vusra                     : 4, # TODO: Add fwd path
-    (Vmlal, vmull, vmull2)    : 4, # TODO: Add fwd path
+    (Vmull, Vmlal)            : 4,
     (vuxtl, vshl, vshl_d,
      vshli, vshrn)            : 2,
+    (vsrshr)                  : 4,
     (AArch64BasicArithmetic,
      AArch64ConditionalSelect,
      AArch64ConditionalCompare,
@@ -201,9 +203,32 @@ def get_min_max_objective(slothy):
 def get_latency(src, out_idx, dst):
     _ = out_idx # out_idx unused
 
-    _ = find_class(src)
-    _ = find_class(dst)
+    instclass_src = find_class(src)
+    instclass_dst = find_class(dst)
+
     latency = lookup_multidict(default_latencies, src)
+
+    # Fast mul->mla/mls forwarding: the mul output is the accumulator of the mla/mls
+    if instclass_src in [vmul, vmul_lane] and \
+       instclass_dst in [vmla, vmla_lane, vmls, vmls_lane] and \
+       src.args_out[0] == dst.args_in_out[0]:
+        return 2
+    # Fast mla/mls->mla/mls forwarding: source and destination share the accumulator
+    if instclass_src in [vmla, vmla_lane, vmls, vmls_lane] and \
+       instclass_dst in [vmla, vmla_lane, vmls, vmls_lane] and \
+       src.args_in_out[0] == dst.args_in_out[0]:
+        return 2
+    # Fast mull->mlal forwarding: the mull output is the accumulator of the mlal
+    if instclass_src in all_subclass_leaves(Vmull) and \
+       instclass_dst in all_subclass_leaves(Vmlal) and \
+       src.args_out[0] == dst.args_in_out[0]:
+        return 1
+    # Fast mlal->mlal forwarding: source and destination share the accumulator
+    if instclass_src in all_subclass_leaves(Vmlal) and \
+       instclass_dst in all_subclass_leaves(Vmlal) and \
+       src.args_in_out[0] == dst.args_in_out[0]:
+        return 1
+
     return latency
 
 def get_units(src):