diff --git a/example.py b/example.py index eedcfbb8..d97dcaf1 100644 --- a/example.py +++ b/example.py @@ -58,7 +58,7 @@ class Example(): def __init__(self, infile, name=None, funcname=None, suffix="opt", rename=False, outfile="", arch=Arch_Armv81M, target=Target_CortexM55r1, - timeout=None, **kwargs): + timeout=None, outfile_full=False, **kwargs): if name is None: name = infile @@ -67,17 +67,23 @@ def __init__(self, infile, name=None, funcname=None, suffix="opt", self.funcname = funcname self.infile = infile self.suffix = suffix - if outfile == "": - self.outfile = f"{infile}_{self.suffix}_{target_label_dict[self.target]}" + if outfile_full is True: + self.outfile = outfile else: - self.outfile = f"{outfile}_{self.suffix}_{target_label_dict[self.target]}" + if outfile == "": + self.outfile = f"{infile}_{self.suffix}_{target_label_dict[self.target]}" + else: + self.outfile = f"{outfile}_{self.suffix}_{target_label_dict[self.target]}" if funcname is None: self.funcname = self.infile subfolder = "" if self.arch == AArch64_Neon: subfolder = "aarch64/" self.infile_full = f"examples/naive/{subfolder}{self.infile}.s" - self.outfile_full = f"examples/opt/{subfolder}{self.outfile}.s" + if outfile_full is False: + self.outfile_full = f"examples/opt/{subfolder}{self.outfile}.s" + else: + self.outfile_full = self.outfile self.name = name self.rename = rename self.timeout = timeout @@ -1358,46 +1364,6 @@ def core(self, slothy): slothy.config.sw_pipelining.optimize_postamble = False slothy.optimize_loop("flt_radix4_fft_loop_start") -class neon_keccak_x4(Example): - def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): - name = "keccak_f1600_x4_hybrid_slothy" - infile = "keccak_f1600_x4_hybrid_slothy" - - if var != "": - name += f"_{var}" - infile += f"_{var}" - name += f"_{target_label_dict[target]}" - - super().__init__(infile, name, outfile=name, rename=True, arch=arch, target=target, timeout=600) - - def core(self, slothy): - 
slothy.config.inputs_are_outputs = True - # TODO: check of all of these are need for all code parts - slothy.config.reserved_regs = ["sp"] - slothy.config.outputs = ["x27"] - slothy.config.reserved_regs += self.target_reserved - slothy.config.constraints.stalls_first_attempt = 8 - slothy.config.variable_size = True - - slothy.config.split_heuristic = True - slothy.config.split_heuristic_repeat = 0 - slothy.config.split_heuristic_preprocess_naive_interleaving = True - - slothy.optimize(start="initial", end="end_initial") - slothy.optimize(start="initial2", end="end_initial2") - slothy.optimize(start="loop_0", end="end_loop_0") - slothy.optimize(start="loop_1", end="end_loop_1") - - slothy.config.split_heuristic = True - slothy.config.split_heuristic_factor = 3 - slothy.config.split_heuristic_stepsize = 0.2 - slothy.config.split_heuristic_repeat = 2 - slothy.optimize(start="initial", end="end_initial") - slothy.optimize(start="initial2", end="end_initial2") - slothy.config.split_heuristic_repeat = 5 - slothy.optimize(start="loop_0", end="end_loop_0") - slothy.optimize(start="loop_1", end="end_loop_1") - class neon_keccak_x1_no_symbolic(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): name = "keccak_f1600_x1_scalar_slothy_no_symbolic" @@ -1418,15 +1384,41 @@ def core(self, slothy): slothy.config.outputs = ["flags"] slothy.config.constraints.stalls_first_attempt = 64 -# slothy.config.ignore_objective = True slothy.config.constraints.minimize_spills = True -# slothy.config.constraints.functional_only = True slothy.config.constraints.allow_reordering = True -# slothy.config.constraints.allow_reordering = False slothy.config.constraints.allow_spills = True slothy.config.constraints.minimize_spills = True slothy.config.visualize_expected_performance = True -# slothy.config.visualize_show_old_code = True + slothy.optimize(start="loop", end="end_loop") + + slothy.config.outputs = ["hint_STACK_OFFSET_COUNT"] + 
slothy.optimize(start="initial_round_start", end="initial_round_end") + +class neon_keccak_x1_scalar_opt(Example): + def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): + name = "keccak_f1600_x1_scalar_opt" + infile = "keccak_f1600_x1_scalar_pre_opt" + outfile = "keccak_f1600_x1_scalar" + + super().__init__(infile, name, outfile=outfile, rename=True, arch=arch, target=target) + + def core(self, slothy): + slothy.config.reserved_regs = ["x18", "sp"] + + slothy.config.inputs_are_outputs = True + slothy.config.variable_size = True + slothy.config.timeout = 10800 + + slothy.config.selfcheck_failure_logfile = "selfcheck_fail.log" + + slothy.config.outputs = ["flags"] + slothy.config.constraints.stalls_first_attempt = 32 + slothy.config.visualize_expected_performance = True + slothy.config.split_heuristic = True + slothy.config.split_heuristic_factor = 1.5 + slothy.config.split_heuristic_stepsize = 0.3 + slothy.config.split_heuristic_repeat = 1 + slothy.config.split_heuristic_optimize_seam = 5 slothy.optimize(start="loop", end="end_loop") @@ -1454,49 +1446,50 @@ def core(self, slothy): slothy.config.outputs = ["flags"] slothy.config.constraints.stalls_first_attempt = 64 slothy.config.ignore_objective = True -# slothy.config.constraints.minimize_spills = True slothy.config.constraints.functional_only = True -# slothy.config.constraints.allow_reordering = True slothy.config.constraints.allow_reordering = False slothy.config.constraints.allow_spills = True -# slothy.config.constraints.minimize_spills = True slothy.config.visualize_expected_performance = True -# slothy.config.visualize_show_old_code = True slothy.optimize(start="loop", end="loop_end") slothy.config.outputs = ["hint_STACK_OFFSET_COUNT"] slothy.optimize(start="initial", end="loop") -class neon_keccak_x1_scalar_opt(Example): +class neon_keccak_x4_interleave(Example): def __init__(self, var="", arch=AArch64_Neon, target=Target_CortexA55): - name = "keccak_f1600_x1_scalar_opt" - infile = 
"keccak_f1600_x1_scalar_pre_opt" - outfile = "keccak_f1600_x1_scalar" + name = "keccak_f1600_x4_hybrid_slothy_interleave" + infile = "keccak_f1600_x4_hybrid_slothy_clean" + outfile = "examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_interleaved.s" - super().__init__(infile, name, outfile=outfile, rename=True, arch=arch, target=target) + super().__init__(infile, name, outfile=outfile, rename="keccak_f1600_x4_hybrid_slothy_interleaved", + arch=arch, target=target, outfile_full=True) def core(self, slothy): slothy.config.reserved_regs = ["x18", "sp"] slothy.config.inputs_are_outputs = True slothy.config.variable_size = True + slothy.config.visualize_expected_performance = False slothy.config.timeout = 10800 slothy.config.selfcheck_failure_logfile = "selfcheck_fail.log" - slothy.config.outputs = ["flags"] - slothy.config.constraints.stalls_first_attempt = 32 + slothy.config.outputs = ["flags", "hint_STACK_OFFSET_COUNT"] + slothy.config.constraints.stalls_first_attempt = 64 + slothy.config.ignore_objective = True + slothy.config.constraints.functional_only = True + slothy.config.constraints.allow_reordering = False + slothy.config.constraints.allow_spills = True slothy.config.visualize_expected_performance = True - slothy.config.split_heuristic = True - slothy.config.split_heuristic_factor = 1.5 - slothy.config.split_heuristic_stepsize = 0.3 - slothy.config.split_heuristic_repeat = 1 - slothy.config.split_heuristic_optimize_seam = 5 - slothy.optimize(start="loop", end="end_loop") + slothy.config.split_heuristic = True + slothy.config.split_heuristic_repeat = 0 + slothy.config.split_heuristic_preprocess_naive_interleaving = True + slothy.config.split_heuristic_preprocess_naive_interleaving_strategy = "alternate" + slothy.config.split_heuristic_estimate_performance = False + slothy.config.absorb_spills = False - slothy.config.outputs = ["hint_STACK_OFFSET_COUNT"] - slothy.optimize(start="initial_round_start", end="initial_round_end") + slothy.optimize(start="loop", 
end="loop_end") ############################################################################################# @@ -1641,10 +1634,10 @@ def main(): # Fixed point fft_fixedpoint_radix4(), # Keccak - neon_keccak_x4(), neon_keccak_x1_no_symbolic(), neon_keccak_x1_scalar_opt(), neon_keccak_x4_no_symbolic(), + neon_keccak_x4_interleave(), ] all_example_names = [e.name for e in examples] diff --git a/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy.s b/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy.s deleted file mode 100644 index 77a8de43..00000000 --- a/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy.s +++ /dev/null @@ -1,1150 +0,0 @@ -/* - * Copyright (c) 2021-2022 Arm Limited - * Copyright (c) 2022 Matthias Kannwischer - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - */ -#include - -// -// Author: Hanno Becker -// Author: Matthias Kannwischer -// - -.macro asm_load dst // @slothy:no-unfold - ASM_LOAD(\dst, round_constants) -.endm - -/********************** CONSTANTS *************************/ - .data - .align(8) -round_constants: - .quad 0x0000000000000001 - .quad 0x0000000000008082 - .quad 0x800000000000808a - .quad 0x8000000080008000 - .quad 0x000000000000808b - .quad 0x0000000080000001 - .quad 0x8000000080008081 - .quad 0x8000000000008009 - .quad 0x000000000000008a - .quad 0x0000000000000088 - .quad 0x0000000080008009 - .quad 0x000000008000000a - .quad 0x000000008000808b - .quad 0x800000000000008b - .quad 0x8000000000008089 - .quad 0x8000000000008003 - .quad 0x8000000000008002 - .quad 0x8000000000000080 - .quad 0x000000000000800a - .quad 0x800000008000000a - .quad 0x8000000080008081 - .quad 0x8000000000008080 - .quad 0x0000000080000001 - .quad 0x8000000080008008 - -/****************** REGISTER ALLOCATIONS *******************/ - - input_addr .req x0 - const_addr .req x29 - count .req w27 - cur_const .req x26 - - /* Mapping of Kecck-f1600 SIMD state to vector registers - * at the beginning and end of each round. 
*/ - - vAba .req v0 - vAbe .req v1 - vAbi .req v2 - vAbo .req v3 - vAbu .req v4 - vAga .req v5 - vAge .req v6 - vAgi .req v7 - vAgo .req v8 - vAgu .req v9 - vAka .req v10 - vAke .req v11 - vAki .req v12 - vAko .req v13 - vAku .req v14 - vAma .req v15 - vAme .req v16 - vAmi .req v17 - vAmo .req v18 - vAmu .req v19 - vAsa .req v20 - vAse .req v21 - vAsi .req v22 - vAso .req v23 - vAsu .req v24 - - /* q-form of the above mapping */ - vAbaq .req q0 - vAbeq .req q1 - vAbiq .req q2 - vAboq .req q3 - vAbuq .req q4 - vAgaq .req q5 - vAgeq .req q6 - vAgiq .req q7 - vAgoq .req q8 - vAguq .req q9 - vAkaq .req q10 - vAkeq .req q11 - vAkiq .req q12 - vAkoq .req q13 - vAkuq .req q14 - vAmaq .req q15 - vAmeq .req q16 - vAmiq .req q17 - vAmoq .req q18 - vAmuq .req q19 - vAsaq .req q20 - vAseq .req q21 - vAsiq .req q22 - vAsoq .req q23 - vAsuq .req q24 - - /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ - C0 .req v30 - C1 .req v29 - C2 .req v28 - C3 .req v27 - C4 .req v26 - - /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ - E0 .req v26 - E1 .req v25 - E2 .req v29 - E3 .req v28 - E4 .req v27 - - /* A_[y,2*x+3*y] = rot(A[x,y]) */ - vAbi_ .req v2 - vAbo_ .req v3 - vAbu_ .req v4 - vAga_ .req v10 - vAge_ .req v11 - vAgi_ .req v7 - vAgo_ .req v8 - vAgu_ .req v9 - vAka_ .req v15 - vAke_ .req v16 - vAki_ .req v12 - vAko_ .req v13 - vAku_ .req v14 - vAma_ .req v20 - vAme_ .req v21 - vAmi_ .req v17 - vAmo_ .req v18 - vAmu_ .req v19 - vAsa_ .req v0 - vAse_ .req v1 - vAsi_ .req v22 - vAso_ .req v23 - vAsu_ .req v24 - vAba_ .req v30 - vAbe_ .req v27 - - /* Unused temporary */ - vtmp .req v31 - - /* Mapping of Kecck-f1600 state to scalar registers - * at the beginning and end of each round. 
*/ - s_Aba .req x1 - sAbe .req x6 - sAbi .req x11 - sAbo .req x16 - sAbu .req x21 - sAga .req x2 - sAge .req x7 - sAgi .req x12 - sAgo .req x17 - sAgu .req x22 - sAka .req x3 - sAke .req x8 - sAki .req x13 - sAko .req x18 - sAku .req x23 - sAma .req x4 - sAme .req x9 - sAmi .req x14 - sAmo .req x19 - sAmu .req x24 - sAsa .req x5 - sAse .req x10 - sAsi .req x15 - sAso .req x20 - sAsu .req x25 - - /* sA_[y,2*x+3*y] = rot(A[x,y]) */ - s_Aba_ .req x0 - sAbe_ .req x28 - sAbi_ .req x11 - sAbo_ .req x16 - sAbu_ .req x21 - sAga_ .req x3 - sAge_ .req x8 - sAgi_ .req x12 - sAgo_ .req x17 - sAgu_ .req x22 - sAka_ .req x4 - sAke_ .req x9 - sAki_ .req x13 - sAko_ .req x18 - sAku_ .req x23 - sAma_ .req x5 - sAme_ .req x10 - sAmi_ .req x14 - sAmo_ .req x19 - sAmu_ .req x24 - sAsa_ .req x1 - sAse_ .req x6 - sAsi_ .req x15 - sAso_ .req x20 - sAsu_ .req x25 - - /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ - /* sE[x] = sC[x-1] xor rot(C[x+1],1), for x in 0..4 */ - sC0 .req x0 - sE0 .req x29 - sC1 .req x26 - sE1 .req x30 - sC2 .req x27 - sE2 .req x26 - sC3 .req x28 - sE3 .req x27 - sC4 .req x29 - sE4 .req x28 - - tmp .req x30 - -/************************ MACROS ****************************/ - -/* Macros using v8.4-A SHA-3 instructions */ - -.macro eor3_m1 d s0 s1 s2 - eor \d\().16b, \s0\().16b, \s1\().16b - eor \d\().16b, \d\().16b, \s2\().16b -.endm - -.macro rax1_m1 d s0 s1 - add vtmp.2d, \s1\().2d, \s1\().2d - sri vtmp.2d, \s1\().2d, #63 - eor \d\().16b, vtmp.16b, \s0\().16b -.endm - -.macro xar_m1 d s0 s1 imm - eor vtmp.16b, \s0\().16b, \s1\().16b - shl \d\().2d, vtmp.2d, #(64-\imm) - sri \d\().2d, vtmp.2d, #(\imm) -.endm - -.macro bcax_m1 d s0 s1 s2 - bic vtmp.16b, \s1\().16b, \s2\().16b - eor \d\().16b, vtmp.16b, \s0\().16b -.endm - -.macro load_input_vector num idx - ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] - ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] - ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] - ldr vAboq, [input_addr, 
#(16*(\num*3+\idx))] - ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] - ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] - ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] - ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] - ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] - ldr vAguq, [input_addr, #(16*(\num*9+\idx))] - ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] - ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] - ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] - ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] - ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] - ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] - ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] - ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] - ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] - ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] - ldr vAsaq, [input_addr, #(16*(\num*20+\idx))] - ldr vAseq, [input_addr, #(16*(\num*21+\idx))] - ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] - ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] - ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] -.endm - -.macro store_input_vector num idx - str vAbaq, [input_addr, #(16*(\num*0+\idx))] - str vAbeq, [input_addr, #(16*(\num*1+\idx))] - str vAbiq, [input_addr, #(16*(\num*2+\idx))] - str vAboq, [input_addr, #(16*(\num*3+\idx))] - str vAbuq, [input_addr, #(16*(\num*4+\idx))] - str vAgaq, [input_addr, #(16*(\num*5+\idx))] - str vAgeq, [input_addr, #(16*(\num*6+\idx))] - str vAgiq, [input_addr, #(16*(\num*7+\idx))] - str vAgoq, [input_addr, #(16*(\num*8+\idx))] - str vAguq, [input_addr, #(16*(\num*9+\idx))] - str vAkaq, [input_addr, #(16*(\num*10+\idx))] - str vAkeq, [input_addr, #(16*(\num*11+\idx))] - str vAkiq, [input_addr, #(16*(\num*12+\idx))] - str vAkoq, [input_addr, #(16*(\num*13+\idx))] - str vAkuq, [input_addr, #(16*(\num*14+\idx))] - str vAmaq, [input_addr, #(16*(\num*15+\idx))] - str vAmeq, [input_addr, #(16*(\num*16+\idx))] - str vAmiq, [input_addr, #(16*(\num*17+\idx))] - str vAmoq, [input_addr, #(16*(\num*18+\idx))] - str vAmuq, [input_addr, 
#(16*(\num*19+\idx))] - str vAsaq, [input_addr, #(16*(\num*20+\idx))] - str vAseq, [input_addr, #(16*(\num*21+\idx))] - str vAsiq, [input_addr, #(16*(\num*22+\idx))] - str vAsoq, [input_addr, #(16*(\num*23+\idx))] - str vAsuq, [input_addr, #(16*(\num*24+\idx))] -.endm - -.macro store_input_scalar num idx - str s_Aba, [input_addr, 8*(\num*(0) +\idx)] - str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] - str sAbi, [input_addr, 8*(\num*(2)+ \idx)] - str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] - str sAbu, [input_addr, 8*(\num*(4)+ \idx)] - str sAga, [input_addr, 8*(\num*(4+1) +\idx)] - str sAge, [input_addr, 8*(\num*(6)+ \idx)] - str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] - str sAgo, [input_addr, 8*(\num*(8)+ \idx)] - str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] - str sAka, [input_addr, 8*(\num*(10) +\idx)] - str sAke, [input_addr, 8*(\num*(10+1)+\idx)] - str sAki, [input_addr, 8*(\num*(12) +\idx)] - str sAko, [input_addr, 8*(\num*(12+1)+\idx)] - str sAku, [input_addr, 8*(\num*(14) +\idx)] - str sAma, [input_addr, 8*(\num*(14+1)+\idx)] - str sAme, [input_addr, 8*(\num*(16) +\idx)] - str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] - str sAmo, [input_addr, 8*(\num*(18) +\idx)] - str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] - str sAsa, [input_addr, 8*(\num*(20) +\idx)] - str sAse, [input_addr, 8*(\num*(20+1)+\idx)] - str sAsi, [input_addr, 8*(\num*(22) +\idx)] - str sAso, [input_addr, 8*(\num*(22+1)+\idx)] - str sAsu, [input_addr, 8*(\num*(24) +\idx)] -.endm - -.macro load_input_scalar num idx - ldr s_Aba, [input_addr, 8*(\num*(0) +\idx)] - ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] - ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] - ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] - ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] - ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] - ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] - ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] - ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] - ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] - ldr sAka, [input_addr, 
8*(\num*(10) +\idx)] - ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] - ldr sAki, [input_addr, 8*(\num*(12) +\idx)] - ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] - ldr sAku, [input_addr, 8*(\num*(14) +\idx)] - ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] - ldr sAme, [input_addr, 8*(\num*(16) +\idx)] - ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] - ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] - ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] - ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] - ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] - ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] - ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] - ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] -.endm - -#define STACK_SIZE (8*8 + 16*6 + 3*8 + 8) // VREGS (8*8), GPRs (16*6), count (8), const (8), input (8), padding (8) -#define STACK_BASE_GPRS (3*8+8) -#define STACK_BASE_VREGS (3*8+8+16*6) -#define STACK_OFFSET_INPUT (0*8) -#define STACK_OFFSET_CONST (1*8) -#define STACK_OFFSET_COUNT (2*8) - -.macro save_gprs - stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] - stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] - stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] - stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] - stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] - stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] -.endm - -.macro restore_gprs - ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] - ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] - ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] - ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] - ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] - ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] -.endm - -.macro save_vregs - stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] - stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] - stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] - stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] -.endm - -.macro restore_vregs - ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] - ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] - ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] - ldp d8, d9, 
[sp,#(STACK_BASE_VREGS+0*16)] -.endm - -.macro alloc_stack - sub sp, sp, #(STACK_SIZE) -.endm - -.macro free_stack - add sp, sp, #(STACK_SIZE) -.endm - -.macro eor5 dst, src0, src1, src2, src3, src4 - eor \dst, \src0, \src1 - eor \dst, \dst, \src2 - eor \dst, \dst, \src3 - eor \dst, \dst, \src4 -.endm - -.macro xor_rol dst, src1, src0, imm - eor \dst, \src0, \src1, ror #(64-\imm) -.endm - -.macro bic_rol dst, src1, src0, imm - bic \dst, \src0, \src1, ror #(64-\imm) -.endm - -.macro rotate dst, src, imm - ror \dst, \src, #(64-\imm) -.endm - -.macro hybrid_round_initial - scalar_round_initial - vector_round_initial -.endm - -.macro scalar_round_initial - eor sC0, sAma, sAsa - eor sC1, sAme, sAse - eor sC2, sAmi, sAsi - eor sC3, sAmo, sAso - eor sC4, sAmu, sAsu - eor sC0, sAka, sC0 - eor sC1, sAke, sC1 - eor sC2, sAki, sC2 - eor sC3, sAko, sC3 - eor sC4, sAku, sC4 - eor sC0, sAga, sC0 - eor sC1, sAge, sC1 - eor sC2, sAgi, sC2 - eor sC3, sAgo, sC3 - eor sC4, sAgu, sC4 - eor sC0, s_Aba, sC0 - eor sC1, sAbe, sC1 - eor sC2, sAbi, sC2 - eor sC3, sAbo, sC3 - eor sC4, sAbu, sC4 - - eor sE1, sC0, sC2, ror #63 - eor sE3, sC2, sC4, ror #63 - eor sE0, sC4, sC1, ror #63 - eor sE2, sC1, sC3, ror #63 - eor sE4, sC3, sC0, ror #63 - - eor s_Aba_, s_Aba, sE0 - eor sAsa_, sAbi, sE2 - eor sAbi_, sAki, sE2 - eor sAki_, sAko, sE3 - eor sAko_, sAmu, sE4 - eor sAmu_, sAso, sE3 - eor sAso_, sAma, sE0 - eor sAka_, sAbe, sE1 - eor sAse_, sAgo, sE3 - eor sAgo_, sAme, sE1 - eor sAke_, sAgi, sE2 - eor sAgi_, sAka, sE0 - eor sAga_, sAbo, sE3 - eor sAbo_, sAmo, sE3 - eor sAmo_, sAmi, sE2 - eor sAmi_, sAke, sE1 - eor sAge_, sAgu, sE4 - eor sAgu_, sAsi, sE2 - eor sAsi_, sAku, sE4 - eor sAku_, sAsa, sE0 - eor sAma_, sAbu, sE4 - eor sAbu_, sAsu, sE4 - eor sAsu_, sAse, sE1 - eor sAme_, sAga, sE0 - eor sAbe_, sAge, sE1 - - asm_load const_addr - - bic tmp, sAgi_, sAge_, ror #47 - eor sAga, tmp, sAga_, ror #39 - bic tmp, sAgo_, sAgi_, ror #42 - eor sAge, tmp, sAge_, ror #25 - bic tmp, sAgu_, sAgo_, ror #16 
- eor sAgi, tmp, sAgi_, ror #58 - bic tmp, sAga_, sAgu_, ror #31 - eor sAgo, tmp, sAgo_, ror #47 - bic tmp, sAge_, sAga_, ror #56 - eor sAgu, tmp, sAgu_, ror #23 - bic tmp, sAki_, sAke_, ror #19 - eor sAka, tmp, sAka_, ror #24 - bic tmp, sAko_, sAki_, ror #47 - eor sAke, tmp, sAke_, ror #2 - bic tmp, sAku_, sAko_, ror #10 - eor sAki, tmp, sAki_, ror #57 - bic tmp, sAka_, sAku_, ror #47 - eor sAko, tmp, sAko_, ror #57 - bic tmp, sAke_, sAka_, ror #5 - eor sAku, tmp, sAku_, ror #52 - bic tmp, sAmi_, sAme_, ror #38 - eor sAma, tmp, sAma_, ror #47 - bic tmp, sAmo_, sAmi_, ror #5 - eor sAme, tmp, sAme_, ror #43 - bic tmp, sAmu_, sAmo_, ror #41 - eor sAmi, tmp, sAmi_, ror #46 - - ldr cur_const, [const_addr] - mov count, #1 - - bic tmp, sAma_, sAmu_, ror #35 - eor sAmo, tmp, sAmo_, ror #12 - bic tmp, sAme_, sAma_, ror #9 - eor sAmu, tmp, sAmu_, ror #44 - bic tmp, sAsi_, sAse_, ror #48 - eor sAsa, tmp, sAsa_, ror #41 - bic tmp, sAso_, sAsi_, ror #2 - eor sAse, tmp, sAse_, ror #50 - bic tmp, sAsu_, sAso_, ror #25 - eor sAsi, tmp, sAsi_, ror #27 - bic tmp, sAsa_, sAsu_, ror #60 - eor sAso, tmp, sAso_, ror #21 - bic tmp, sAse_, sAsa_, ror #57 - eor sAsu, tmp, sAsu_, ror #53 - bic tmp, sAbi_, sAbe_, ror #63 - eor s_Aba, s_Aba_, tmp, ror #21 - bic tmp, sAbo_, sAbi_, ror #42 - eor sAbe, tmp, sAbe_, ror #41 - bic tmp, sAbu_, sAbo_, ror #57 - eor sAbi, tmp, sAbi_, ror #35 - bic tmp, s_Aba_, sAbu_, ror #50 - eor sAbo, tmp, sAbo_, ror #43 - bic tmp, sAbe_, s_Aba_, ror #44 - eor sAbu, tmp, sAbu_, ror #30 - - eor s_Aba, s_Aba, cur_const - - str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - - eor sC0, sAka, sAsa, ror #50 - eor sC1, sAse, sAge, ror #60 - eor sC2, sAmi, sAgi, ror #59 - eor sC3, sAgo, sAso, ror #30 - eor sC4, sAbu, sAsu, ror #53 - eor sC0, sAma, sC0, ror #49 - eor sC1, sAbe, sC1, ror #44 - eor sC2, sAki, sC2, ror #26 - eor sC3, sAmo, sC3, ror #63 - eor sC4, sAmu, sC4, ror #56 - eor sC0, sAga, sC0, ror #57 - eor sC1, sAme, sC1, ror #58 - eor sC2, 
sAbi, sC2, ror #60 - eor sC3, sAko, sC3, ror #38 - eor sC4, sAgu, sC4, ror #48 - eor sC0, s_Aba, sC0, ror #61 - eor sC1, sAke, sC1, ror #57 - eor sC2, sAsi, sC2, ror #52 - eor sC3, sAbo, sC3, ror #63 - eor sC4, sAku, sC4, ror #50 - ror sC1, sC1, #56 - ror sC4, sC4, #58 - ror sC2, sC2, #62 - - eor sE1, sC0, sC2, ror #63 - eor sE3, sC2, sC4, ror #63 - eor sE0, sC4, sC1, ror #63 - eor sE2, sC1, sC3, ror #63 - eor sE4, sC3, sC0, ror #63 - - eor s_Aba_, sE0, s_Aba - eor sAsa_, sE2, sAbi, ror #50 - eor sAbi_, sE2, sAki, ror #46 - eor sAki_, sE3, sAko, ror #63 - eor sAko_, sE4, sAmu, ror #28 - eor sAmu_, sE3, sAso, ror #2 - eor sAso_, sE0, sAma, ror #54 - eor sAka_, sE1, sAbe, ror #43 - eor sAse_, sE3, sAgo, ror #36 - eor sAgo_, sE1, sAme, ror #49 - eor sAke_, sE2, sAgi, ror #3 - eor sAgi_, sE0, sAka, ror #39 - eor sAga_, sE3, sAbo - eor sAbo_, sE3, sAmo, ror #37 - eor sAmo_, sE2, sAmi, ror #8 - eor sAmi_, sE1, sAke, ror #56 - eor sAge_, sE4, sAgu, ror #44 - eor sAgu_, sE2, sAsi, ror #62 - eor sAsi_, sE4, sAku, ror #58 - eor sAku_, sE0, sAsa, ror #25 - eor sAma_, sE4, sAbu, ror #20 - eor sAbu_, sE4, sAsu, ror #9 - eor sAsu_, sE1, sAse, ror #23 - eor sAme_, sE0, sAga, ror #61 - eor sAbe_, sE1, sAge, ror #19 - - asm_load const_addr - ldr count, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT - - bic tmp, sAgi_, sAge_, ror #47 - eor sAga, tmp, sAga_, ror #39 - bic tmp, sAgo_, sAgi_, ror #42 - eor sAge, tmp, sAge_, ror #25 - bic tmp, sAgu_, sAgo_, ror #16 - eor sAgi, tmp, sAgi_, ror #58 - bic tmp, sAga_, sAgu_, ror #31 - eor sAgo, tmp, sAgo_, ror #47 - bic tmp, sAge_, sAga_, ror #56 - eor sAgu, tmp, sAgu_, ror #23 - bic tmp, sAki_, sAke_, ror #19 - eor sAka, tmp, sAka_, ror #24 - bic tmp, sAko_, sAki_, ror #47 - eor sAke, tmp, sAke_, ror #2 - bic tmp, sAku_, sAko_, ror #10 - eor sAki, tmp, sAki_, ror #57 - bic tmp, sAka_, sAku_, ror #47 - eor sAko, tmp, sAko_, ror #57 - bic tmp, sAke_, sAka_, ror #5 - eor sAku, tmp, sAku_, ror #52 - bic tmp, sAmi_, sAme_, ror 
#38 - eor sAma, tmp, sAma_, ror #47 - bic tmp, sAmo_, sAmi_, ror #5 - eor sAme, tmp, sAme_, ror #43 - bic tmp, sAmu_, sAmo_, ror #41 - eor sAmi, tmp, sAmi_, ror #46 - bic tmp, sAma_, sAmu_, ror #35 - - ldr cur_const, [const_addr, count, UXTW #3] - - eor sAmo, tmp, sAmo_, ror #12 - bic tmp, sAme_, sAma_, ror #9 - eor sAmu, tmp, sAmu_, ror #44 - bic tmp, sAsi_, sAse_, ror #48 - eor sAsa, tmp, sAsa_, ror #41 - bic tmp, sAso_, sAsi_, ror #2 - eor sAse, tmp, sAse_, ror #50 - bic tmp, sAsu_, sAso_, ror #25 - eor sAsi, tmp, sAsi_, ror #27 - bic tmp, sAsa_, sAsu_, ror #60 - eor sAso, tmp, sAso_, ror #21 - bic tmp, sAse_, sAsa_, ror #57 - eor sAsu, tmp, sAsu_, ror #53 - bic tmp, sAbi_, sAbe_, ror #63 - eor s_Aba, s_Aba_, tmp, ror #21 - bic tmp, sAbo_, sAbi_, ror #42 - eor sAbe, tmp, sAbe_, ror #41 - bic tmp, sAbu_, sAbo_, ror #57 - eor sAbi, tmp, sAbi_, ror #35 - bic tmp, s_Aba_, sAbu_, ror #50 - eor sAbo, tmp, sAbo_, ror #43 - bic tmp, sAbe_, s_Aba_, ror #44 - eor sAbu, tmp, sAbu_, ror #30 - add count, count, #1 - eor s_Aba, s_Aba, cur_const -.endm - - -.macro vector_round_initial - eor3_m1 C0, vAba, vAga, vAka - eor3_m1 C0, C0, vAma, vAsa - eor3_m1 C1, vAbe, vAge, vAke - eor3_m1 C1, C1, vAme, vAse - eor3_m1 C2, vAbi, vAgi, vAki - eor3_m1 C2, C2, vAmi, vAsi - eor3_m1 C3, vAbo, vAgo, vAko - eor3_m1 C3, C3, vAmo, vAso - eor3_m1 C4, vAbu, vAgu, vAku - eor3_m1 C4, C4, vAmu, vAsu - rax1_m1 E1, C0, C2 - rax1_m1 E3, C2, C4 - rax1_m1 E0, C4, C1 - rax1_m1 E2, C1, C3 - rax1_m1 E4, C3, C0 - eor vAba_.16b, vAba.16b, E0.16b - xar_m1 vAsa_, vAbi, E2, 2 - xar_m1 vAbi_, vAki, E2, 21 - xar_m1 vAki_, vAko, E3, 39 - xar_m1 vAko_, vAmu, E4, 56 - xar_m1 vAmu_, vAso, E3, 8 - xar_m1 vAso_, vAma, E0, 23 - xar_m1 vAka_, vAbe, E1, 63 - xar_m1 vAse_, vAgo, E3, 9 - xar_m1 vAgo_, vAme, E1, 19 - xar_m1 vAke_, vAgi, E2, 58 - xar_m1 vAgi_, vAka, E0, 61 - xar_m1 vAga_, vAbo, E3, 36 - xar_m1 vAbo_, vAmo, E3, 43 - xar_m1 vAmo_, vAmi, E2, 49 - xar_m1 vAmi_, vAke, E1, 54 - xar_m1 vAge_, vAgu, E4, 44 - xar_m1 
vAgu_, vAsi, E2, 3 - xar_m1 vAsi_, vAku, E4, 25 - xar_m1 vAku_, vAsa, E0, 46 - xar_m1 vAma_, vAbu, E4, 37 - xar_m1 vAbu_, vAsu, E4, 50 - xar_m1 vAsu_, vAse, E1, 62 - xar_m1 vAme_, vAga, E0, 28 - xar_m1 vAbe_, vAge, E1, 20 - ldr sE1, [sp, #STACK_OFFSET_CONST] // @slothy:reads=STACK_OFFSET_CONST - ld1r {v28.2d}, [sE1], #8 - str sE1, [sp, #STACK_OFFSET_CONST] // @slothy:writes=STACK_OFFSET_CONST - bcax_m1 vAga, vAga_, vAgi_, vAge_ - bcax_m1 vAge, vAge_, vAgo_, vAgi_ - bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ - bcax_m1 vAgo, vAgo_, vAga_, vAgu_ - bcax_m1 vAgu, vAgu_, vAge_, vAga_ - bcax_m1 vAka, vAka_, vAki_, vAke_ - bcax_m1 vAke, vAke_, vAko_, vAki_ - bcax_m1 vAki, vAki_, vAku_, vAko_ - bcax_m1 vAko, vAko_, vAka_, vAku_ - bcax_m1 vAku, vAku_, vAke_, vAka_ - bcax_m1 vAma, vAma_, vAmi_, vAme_ - bcax_m1 vAme, vAme_, vAmo_, vAmi_ - bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ - bcax_m1 vAmo, vAmo_, vAma_, vAmu_ - bcax_m1 vAmu, vAmu_, vAme_, vAma_ - bcax_m1 vAsa, vAsa_, vAsi_, vAse_ - bcax_m1 vAse, vAse_, vAso_, vAsi_ - bcax_m1 vAsi, vAsi_, vAsu_, vAso_ - bcax_m1 vAso, vAso_, vAsa_, vAsu_ - bcax_m1 vAsu, vAsu_, vAse_, vAsa_ - bcax_m1 vAba, vAba_, vAbi_, vAbe_ - bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ - bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ - bcax_m1 vAbo, vAbo_, vAba_, vAbu_ - bcax_m1 vAbu, vAbu_, vAbe_, vAba_ - eor vAba.16b, vAba.16b, v28.16b -.endm - -.macro hybrid_round_noninitial - scalar_round_noninitial - vector_round_noninitial -.endm - -.macro scalar_round_noninitial - str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - - eor sC0, sAka, sAsa, ror #50 - eor sC1, sAse, sAge, ror #60 - eor sC2, sAmi, sAgi, ror #59 - eor sC3, sAgo, sAso, ror #30 - eor sC4, sAbu, sAsu, ror #53 - eor sC0, sAma, sC0, ror #49 - eor sC1, sAbe, sC1, ror #44 - eor sC2, sAki, sC2, ror #26 - eor sC3, sAmo, sC3, ror #63 - eor sC4, sAmu, sC4, ror #56 - eor sC0, sAga, sC0, ror #57 - eor sC1, sAme, sC1, ror #58 - eor sC2, sAbi, sC2, ror #60 - eor sC3, sAko, sC3, ror #38 - eor sC4, sAgu, sC4, ror #48 - 
eor sC0, s_Aba, sC0, ror #61 - eor sC1, sAke, sC1, ror #57 - eor sC2, sAsi, sC2, ror #52 - eor sC3, sAbo, sC3, ror #63 - eor sC4, sAku, sC4, ror #50 - ror sC1, sC1, #56 - ror sC4, sC4, #58 - ror sC2, sC2, #62 - - eor sE1, sC0, sC2, ror #63 - eor sE3, sC2, sC4, ror #63 - eor sE0, sC4, sC1, ror #63 - eor sE2, sC1, sC3, ror #63 - eor sE4, sC3, sC0, ror #63 - - eor s_Aba_, sE0, s_Aba - eor sAsa_, sE2, sAbi, ror #50 - eor sAbi_, sE2, sAki, ror #46 - eor sAki_, sE3, sAko, ror #63 - eor sAko_, sE4, sAmu, ror #28 - eor sAmu_, sE3, sAso, ror #2 - eor sAso_, sE0, sAma, ror #54 - eor sAka_, sE1, sAbe, ror #43 - eor sAse_, sE3, sAgo, ror #36 - eor sAgo_, sE1, sAme, ror #49 - eor sAke_, sE2, sAgi, ror #3 - eor sAgi_, sE0, sAka, ror #39 - eor sAga_, sE3, sAbo - eor sAbo_, sE3, sAmo, ror #37 - eor sAmo_, sE2, sAmi, ror #8 - eor sAmi_, sE1, sAke, ror #56 - eor sAge_, sE4, sAgu, ror #44 - eor sAgu_, sE2, sAsi, ror #62 - eor sAsi_, sE4, sAku, ror #58 - eor sAku_, sE0, sAsa, ror #25 - eor sAma_, sE4, sAbu, ror #20 - eor sAbu_, sE4, sAsu, ror #9 - eor sAsu_, sE1, sAse, ror #23 - eor sAme_, sE0, sAga, ror #61 - eor sAbe_, sE1, sAge, ror #19 - - asm_load const_addr - ldr count, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT - - bic tmp, sAgi_, sAge_, ror #47 - eor sAga, tmp, sAga_, ror #39 - bic tmp, sAgo_, sAgi_, ror #42 - eor sAge, tmp, sAge_, ror #25 - bic tmp, sAgu_, sAgo_, ror #16 - eor sAgi, tmp, sAgi_, ror #58 - bic tmp, sAga_, sAgu_, ror #31 - eor sAgo, tmp, sAgo_, ror #47 - bic tmp, sAge_, sAga_, ror #56 - eor sAgu, tmp, sAgu_, ror #23 - bic tmp, sAki_, sAke_, ror #19 - eor sAka, tmp, sAka_, ror #24 - bic tmp, sAko_, sAki_, ror #47 - eor sAke, tmp, sAke_, ror #2 - bic tmp, sAku_, sAko_, ror #10 - eor sAki, tmp, sAki_, ror #57 - bic tmp, sAka_, sAku_, ror #47 - eor sAko, tmp, sAko_, ror #57 - bic tmp, sAke_, sAka_, ror #5 - eor sAku, tmp, sAku_, ror #52 - bic tmp, sAmi_, sAme_, ror #38 - eor sAma, tmp, sAma_, ror #47 - bic tmp, sAmo_, sAmi_, ror #5 - eor sAme, 
tmp, sAme_, ror #43 - bic tmp, sAmu_, sAmo_, ror #41 - eor sAmi, tmp, sAmi_, ror #46 - bic tmp, sAma_, sAmu_, ror #35 - - ldr cur_const, [const_addr, count, UXTW #3] - add count, count, #1 - - eor sAmo, tmp, sAmo_, ror #12 - bic tmp, sAme_, sAma_, ror #9 - eor sAmu, tmp, sAmu_, ror #44 - bic tmp, sAsi_, sAse_, ror #48 - eor sAsa, tmp, sAsa_, ror #41 - bic tmp, sAso_, sAsi_, ror #2 - eor sAse, tmp, sAse_, ror #50 - bic tmp, sAsu_, sAso_, ror #25 - eor sAsi, tmp, sAsi_, ror #27 - bic tmp, sAsa_, sAsu_, ror #60 - eor sAso, tmp, sAso_, ror #21 - bic tmp, sAse_, sAsa_, ror #57 - eor sAsu, tmp, sAsu_, ror #53 - bic tmp, sAbi_, sAbe_, ror #63 - eor s_Aba, s_Aba_, tmp, ror #21 - bic tmp, sAbo_, sAbi_, ror #42 - eor sAbe, tmp, sAbe_, ror #41 - bic tmp, sAbu_, sAbo_, ror #57 - eor sAbi, tmp, sAbi_, ror #35 - bic tmp, s_Aba_, sAbu_, ror #50 - eor sAbo, tmp, sAbo_, ror #43 - bic tmp, sAbe_, s_Aba_, ror #44 - eor sAbu, tmp, sAbu_, ror #30 - - eor s_Aba, s_Aba, cur_const - str count, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT - - eor sC0, sAka, sAsa, ror #50 - eor sC1, sAse, sAge, ror #60 - eor sC2, sAmi, sAgi, ror #59 - eor sC3, sAgo, sAso, ror #30 - eor sC4, sAbu, sAsu, ror #53 - eor sC0, sAma, sC0, ror #49 - eor sC1, sAbe, sC1, ror #44 - eor sC2, sAki, sC2, ror #26 - eor sC3, sAmo, sC3, ror #63 - eor sC4, sAmu, sC4, ror #56 - eor sC0, sAga, sC0, ror #57 - eor sC1, sAme, sC1, ror #58 - eor sC2, sAbi, sC2, ror #60 - eor sC3, sAko, sC3, ror #38 - eor sC4, sAgu, sC4, ror #48 - eor sC0, s_Aba, sC0, ror #61 - eor sC1, sAke, sC1, ror #57 - eor sC2, sAsi, sC2, ror #52 - eor sC3, sAbo, sC3, ror #63 - eor sC4, sAku, sC4, ror #50 - ror sC1, sC1, #56 - ror sC4, sC4, #58 - ror sC2, sC2, #62 - - eor sE1, sC0, sC2, ror #63 - eor sE3, sC2, sC4, ror #63 - eor sE0, sC4, sC1, ror #63 - eor sE2, sC1, sC3, ror #63 - eor sE4, sC3, sC0, ror #63 - - eor s_Aba_, sE0, s_Aba - eor sAsa_, sE2, sAbi, ror #50 - eor sAbi_, sE2, sAki, ror #46 - eor sAki_, sE3, sAko, ror #63 - eor sAko_, 
sE4, sAmu, ror #28 - eor sAmu_, sE3, sAso, ror #2 - eor sAso_, sE0, sAma, ror #54 - eor sAka_, sE1, sAbe, ror #43 - eor sAse_, sE3, sAgo, ror #36 - eor sAgo_, sE1, sAme, ror #49 - eor sAke_, sE2, sAgi, ror #3 - eor sAgi_, sE0, sAka, ror #39 - eor sAga_, sE3, sAbo - eor sAbo_, sE3, sAmo, ror #37 - eor sAmo_, sE2, sAmi, ror #8 - eor sAmi_, sE1, sAke, ror #56 - eor sAge_, sE4, sAgu, ror #44 - eor sAgu_, sE2, sAsi, ror #62 - eor sAsi_, sE4, sAku, ror #58 - eor sAku_, sE0, sAsa, ror #25 - eor sAma_, sE4, sAbu, ror #20 - eor sAbu_, sE4, sAsu, ror #9 - eor sAsu_, sE1, sAse, ror #23 - eor sAme_, sE0, sAga, ror #61 - eor sAbe_, sE1, sAge, ror #19 - - asm_load const_addr - ldr count, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT - - bic tmp, sAgi_, sAge_, ror #47 - eor sAga, tmp, sAga_, ror #39 - bic tmp, sAgo_, sAgi_, ror #42 - eor sAge, tmp, sAge_, ror #25 - bic tmp, sAgu_, sAgo_, ror #16 - eor sAgi, tmp, sAgi_, ror #58 - bic tmp, sAga_, sAgu_, ror #31 - eor sAgo, tmp, sAgo_, ror #47 - bic tmp, sAge_, sAga_, ror #56 - eor sAgu, tmp, sAgu_, ror #23 - bic tmp, sAki_, sAke_, ror #19 - eor sAka, tmp, sAka_, ror #24 - bic tmp, sAko_, sAki_, ror #47 - eor sAke, tmp, sAke_, ror #2 - bic tmp, sAku_, sAko_, ror #10 - eor sAki, tmp, sAki_, ror #57 - bic tmp, sAka_, sAku_, ror #47 - eor sAko, tmp, sAko_, ror #57 - bic tmp, sAke_, sAka_, ror #5 - eor sAku, tmp, sAku_, ror #52 - bic tmp, sAmi_, sAme_, ror #38 - eor sAma, tmp, sAma_, ror #47 - bic tmp, sAmo_, sAmi_, ror #5 - eor sAme, tmp, sAme_, ror #43 - bic tmp, sAmu_, sAmo_, ror #41 - eor sAmi, tmp, sAmi_, ror #46 - bic tmp, sAma_, sAmu_, ror #35 - - ldr cur_const, [const_addr, count, UXTW #3] - add count, count, #1 - - eor sAmo, tmp, sAmo_, ror #12 - bic tmp, sAme_, sAma_, ror #9 - eor sAmu, tmp, sAmu_, ror #44 - bic tmp, sAsi_, sAse_, ror #48 - eor sAsa, tmp, sAsa_, ror #41 - bic tmp, sAso_, sAsi_, ror #2 - eor sAse, tmp, sAse_, ror #50 - bic tmp, sAsu_, sAso_, ror #25 - eor sAsi, tmp, sAsi_, ror #27 - bic tmp, 
sAsa_, sAsu_, ror #60 - eor sAso, tmp, sAso_, ror #21 - bic tmp, sAse_, sAsa_, ror #57 - eor sAsu, tmp, sAsu_, ror #53 - bic tmp, sAbi_, sAbe_, ror #63 - eor s_Aba, s_Aba_, tmp, ror #21 - bic tmp, sAbo_, sAbi_, ror #42 - eor sAbe, tmp, sAbe_, ror #41 - bic tmp, sAbu_, sAbo_, ror #57 - eor sAbi, tmp, sAbi_, ror #35 - bic tmp, s_Aba_, sAbu_, ror #50 - eor sAbo, tmp, sAbo_, ror #43 - bic tmp, sAbe_, s_Aba_, ror #44 - eor sAbu, tmp, sAbu_, ror #30 - - eor s_Aba, s_Aba, cur_const - -.endm - - -.macro vector_round_noninitial - eor3_m1 C0, vAba, vAga, vAka - eor3_m1 C0, C0, vAma, vAsa - eor3_m1 C1, vAbe, vAge, vAke - eor3_m1 C1, C1, vAme, vAse - eor3_m1 C2, vAbi, vAgi, vAki - eor3_m1 C2, C2, vAmi, vAsi - eor3_m1 C3, vAbo, vAgo, vAko - eor3_m1 C3, C3, vAmo, vAso - eor3_m1 C4, vAbu, vAgu, vAku - eor3_m1 C4, C4, vAmu, vAsu - rax1_m1 E1, C0, C2 - rax1_m1 E3, C2, C4 - rax1_m1 E0, C4, C1 - rax1_m1 E2, C1, C3 - rax1_m1 E4, C3, C0 - eor vAba_.16b, vAba.16b, E0.16b - xar_m1 vAsa_, vAbi, E2, 2 - xar_m1 vAbi_, vAki, E2, 21 - xar_m1 vAki_, vAko, E3, 39 - xar_m1 vAko_, vAmu, E4, 56 - xar_m1 vAmu_, vAso, E3, 8 - xar_m1 vAso_, vAma, E0, 23 - xar_m1 vAka_, vAbe, E1, 63 - xar_m1 vAse_, vAgo, E3, 9 - xar_m1 vAgo_, vAme, E1, 19 - xar_m1 vAke_, vAgi, E2, 58 - xar_m1 vAgi_, vAka, E0, 61 - xar_m1 vAga_, vAbo, E3, 36 - xar_m1 vAbo_, vAmo, E3, 43 - xar_m1 vAmo_, vAmi, E2, 49 - xar_m1 vAmi_, vAke, E1, 54 - xar_m1 vAge_, vAgu, E4, 44 - xar_m1 vAgu_, vAsi, E2, 3 - xar_m1 vAsi_, vAku, E4, 25 - xar_m1 vAku_, vAsa, E0, 46 - xar_m1 vAma_, vAbu, E4, 37 - xar_m1 vAbu_, vAsu, E4, 50 - xar_m1 vAsu_, vAse, E1, 62 - xar_m1 vAme_, vAga, E0, 28 - xar_m1 vAbe_, vAge, E1, 20 - ldr sE1, [sp, #STACK_OFFSET_CONST] // @slothy:reads=STACK_OFFSET_CONST - ld1r {v28.2d}, [sE1], #8 - str sE1, [sp, #STACK_OFFSET_CONST] // @slothy:writes=STACK_OFFSET_CONST - bcax_m1 vAga, vAga_, vAgi_, vAge_ - bcax_m1 vAge, vAge_, vAgo_, vAgi_ - bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ - bcax_m1 vAgo, vAgo_, vAga_, vAgu_ - bcax_m1 vAgu, vAgu_, 
vAge_, vAga_ - bcax_m1 vAka, vAka_, vAki_, vAke_ - bcax_m1 vAke, vAke_, vAko_, vAki_ - bcax_m1 vAki, vAki_, vAku_, vAko_ - bcax_m1 vAko, vAko_, vAka_, vAku_ - bcax_m1 vAku, vAku_, vAke_, vAka_ - bcax_m1 vAma, vAma_, vAmi_, vAme_ - bcax_m1 vAme, vAme_, vAmo_, vAmi_ - bcax_m1 vAmi, vAmi_, vAmu_, vAmo_ - bcax_m1 vAmo, vAmo_, vAma_, vAmu_ - bcax_m1 vAmu, vAmu_, vAme_, vAma_ - bcax_m1 vAsa, vAsa_, vAsi_, vAse_ - bcax_m1 vAse, vAse_, vAso_, vAsi_ - bcax_m1 vAsi, vAsi_, vAsu_, vAso_ - bcax_m1 vAso, vAso_, vAsa_, vAsu_ - bcax_m1 vAsu, vAsu_, vAse_, vAsa_ - bcax_m1 vAba, vAba_, vAbi_, vAbe_ - bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ - bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ - bcax_m1 vAbo, vAbo_, vAba_, vAbu_ - bcax_m1 vAbu, vAbu_, vAbe_, vAba_ - eor vAba.16b, vAba.16b, v28.16b -.endm - - -.macro final_rotate - ror sAga, sAga,#(64-3) - ror sAka, sAka,#(64-25) - ror sAma, sAma,#(64-10) - ror sAsa, sAsa,#(64-39) - ror sAbe, sAbe,#(64-21) - ror sAge, sAge,#(64-45) - ror sAke, sAke,#(64-8) - ror sAme, sAme,#(64-15) - ror sAse, sAse,#(64-41) - ror sAbi, sAbi,#(64-14) - ror sAgi, sAgi,#(64-61) - ror sAki, sAki,#(64-18) - ror sAmi, sAmi,#(64-56) - ror sAsi, sAsi,#(64-2) - ror sAgo, sAgo,#(64-28) - ror sAko, sAko,#(64-1) - ror sAmo, sAmo,#(64-27) - ror sAso, sAso,#(64-62) - ror sAbu, sAbu,#(64-44) - ror sAgu, sAgu,#(64-20) - ror sAku, sAku,#(64-6) - ror sAmu, sAmu,#(64-36) - ror sAsu, sAsu,#(64-55) -.endm - -#define KECCAK_F1600_ROUNDS 24 - -.global keccak_f1600_x4_hybrid_slothy -.global _keccak_f1600_x4_hybrid_slothy -.text -.align 4 - -keccak_f1600_x4_hybrid_slothy: -_keccak_f1600_x4_hybrid_slothy: - alloc_stack - save_gprs - save_vregs - str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT - - load_input_vector 2,1 - - asm_load const_addr - str const_addr, [sp, #STACK_OFFSET_CONST] // @slothy:writes=STACK_OFFSET_CONST - - // First scalar Keccak computation alongside first half of SIMD computation - load_input_scalar 4,0 - initial: - hybrid_round_initial - 
end_initial: - loop_0: - hybrid_round_noninitial - end_loop_0: - cmp count, #(KECCAK_F1600_ROUNDS-1) - ble loop_0 - final_rotate - ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT - store_input_scalar 4,0 - - // Second scalar Keccak computation alongsie second half of SIMD computation - load_input_scalar 4,1 - initial2: - hybrid_round_initial - end_initial2: - loop_1: - hybrid_round_noninitial - end_loop_1: - cmp count, #(KECCAK_F1600_ROUNDS-1) - ble loop_1 - final_rotate - ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT - store_input_scalar 4, 1 - - store_input_vector 2,1 - - restore_vregs - restore_gprs - free_stack - ret \ No newline at end of file diff --git a/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_clean.s b/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_clean.s new file mode 100644 index 00000000..bb83be51 --- /dev/null +++ b/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_clean.s @@ -0,0 +1,959 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ +#include + +#define KECCAK_F1600_ROUNDS 24 + +// +// Author: Hanno Becker +// Author: Matthias Kannwischer +// + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + outer .req x30 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register outside the state mapping, clobbered by rax1_m1, xar_m1 and bcax_m1 */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + sAba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x28 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + tmp .req x30 // aliases x30, the same register as outer + +/************************ MACROS ****************************/ + +.macro eor3_m1 d s0 s1 s2 // d = s0 ^ s1 ^ s2, as two EORs + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm + +.macro rax1_m1 d s0 s1 // d = s0 ^ rol(s1, 1) per 64-bit lane; clobbers vtmp + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro xar_m1 d s0 s1 imm // d = ror(s0 ^ s1, imm) per 64-bit lane; clobbers vtmp + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm + +.macro bcax_m1 d s0 s1 s2 // d = s0 ^ (s1 & ~s2); clobbers vtmp + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro load_input_vector num idx // loads the 25 q-form state registers, register i from byte offset 16*(num*i+idx) of input_addr + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr,
#(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_vector num idx // stores the 25 q-form state registers, register i to byte offset 16*(num*i+idx) of input_addr + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + +.macro store_input_scalar num idx // stores the 25 scalar state registers, register i to byte offset 8*(num*i+idx) of input_addr + str sAba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] +
str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +.macro load_input_scalar num idx // loads the 25 scalar state registers, register i from byte offset 8*(num*i+idx) of input_addr + ldr sAba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + +#define STACK_LOCS 6 + +#define STACK_SIZE (16*6 + 8*8 + 6*8 + (STACK_LOCS) * 8) +#define STACK_BASE_GPRS (6*8) +#define STACK_BASE_VREGS
(6*8 + 16*6) +#define STACK_OFFSET_LOCS (16*6 + 8*8 + 6*8) + +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST_SCALAR (1*8) +#define STACK_OFFSET_CONST_VECTOR (2*8) +#define STACK_OFFSET_COUNT (3*8) +#define STACK_OFFSET_OUTER (4*8) + +#define STACK_LOC_0 ((STACK_OFFSET_LOCS) + 0*8) +#define STACK_LOC_1 ((STACK_OFFSET_LOCS) + 1*8) +#define STACK_LOC_2 ((STACK_OFFSET_LOCS) + 2*8) +#define STACK_LOC_3 ((STACK_OFFSET_LOCS) + 3*8) +#define STACK_LOC_4 ((STACK_OFFSET_LOCS) + 4*8) +#define STACK_LOC_5 ((STACK_OFFSET_LOCS) + 5*8) /* sixth of the STACK_LOCS slots */ + +.macro save_gprs // saves x19-x30 in the frame at STACK_BASE_GPRS + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro restore_gprs // reloads x19-x30 from the slots written by save_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + +.macro save_vregs // saves d8-d15 in the frame at STACK_BASE_VREGS + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + +.macro restore_vregs // reloads d8-d15 from the slots written by save_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + +.macro alloc_stack // reserves STACK_SIZE bytes below sp + sub sp, sp, #(STACK_SIZE) +.endm + +.macro free_stack // releases the STACK_SIZE bytes reserved by alloc_stack + add sp, sp, #(STACK_SIZE) +.endm + +.macro eor5 dst, src0, src1, src2, src3, src4 // dst = src0 ^ src1 ^ src2 ^ src3 ^ src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + +.macro chi_step_ror out, a, b, c, r1, r2 // out = (b & ~ror(c, r1)) ^ ror(a, r2), via X; NOTE(review): no alias named X appears in the register allocations, confirm it is defined before use + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), X, \a\(), ror #\r2 +.endm + +.macro
chi_step_ror2 out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), \a\(), X, ror #\r2 +.endm + +.macro scalar_round_initial + eor x30, x24, x25 // *..................................................... + eor x27, x9, x10 // *..................................................... + eor x0, x30, x21 // .*.................................................... + eor x26, x27, x6 // .*.................................................... + eor x27, x26, x7 // ..*................................................... + eor x29, x0, x22 // ..*................................................... + eor x26, x29, x23 // ...*.................................................. + eor x29, x4, x5 // ...*.................................................. + eor x30, x29, x1 // ....*................................................. + eor x0, x27, x8 // ....*................................................. + eor x29, x30, x2 // .....*................................................ + eor x30, x19, x20 // .....*................................................ + eor x30, x30, x16 // ......*............................................... + eor x27, x26, x0, ror #63 // ......*............................................... + eor x4, x4, x27 // .......*.............................................. + eor x30, x30, x17 // .......*.............................................. + eor x30, x30, x28 // ........*............................................. + eor x29, x29, x3 // ........*............................................. + eor x0, x0, x30, ror #63 // .........*............................................ + eor x30, x30, x29, ror #63 // .........*............................................ + eor x22, x22, x30 // ..........*........................................... + eor x23, x23, x30 // ..........*........................................... + str x23, [sp, #STACK_LOC_0] // ...........*.......................................... 
// @slothy:writes=stack_0 + eor x23, x14, x15 // ...........*.......................................... + eor x14, x14, x0 // ............*......................................... + eor x23, x23, x11 // ............*......................................... + eor x15, x15, x0 // .............*........................................ + eor x1, x1, x27 // .............*........................................ + eor x23, x23, x12 // ..............*....................................... + eor x23, x23, x13 // ...............*...................................... + eor x11, x11, x0 // ...............*...................................... + eor x29, x29, x23, ror #63 // ................*..................................... + eor x23, x23, x26, ror #63 // ................*..................................... + eor x26, x13, x0 // .................*.................................... + eor x13, x28, x23 // .................*.................................... + eor x28, x24, x30 // ..................*................................... + eor x24, x16, x23 // ..................*................................... + eor x16, x21, x30 // ...................*.................................. + eor x21, x25, x30 // ...................*.................................. + eor x30, x19, x23 // ....................*................................. + eor x19, x20, x23 // ....................*................................. + eor x20, x17, x23 // .....................*................................ + eor x17, x12, x0 // .....................*................................ + eor x0, x2, x27 // ......................*............................... + eor x2, x6, x29 // ......................*............................... + eor x6, x8, x29 // .......................*.............................. + bic x8, x28, x13, ror #47 // .......................*.............................. + eor x12, x3, x27 // ........................*............................. 
+ bic x3, x13, x17, ror #19 // ........................*............................. + eor x5, x5, x27 // .........................*............................ + ldr x27, [sp, #STACK_LOC_0] // .........................*............................ // @slothy:reads=stack_0 + bic x25, x17, x2, ror #5 // ..........................*........................... + eor x9, x9, x29 // ..........................*........................... + eor x23, x25, x5, ror #52 // ...........................*.......................... + eor x3, x3, x2, ror #24 // ...........................*.......................... + eor x8, x8, x17, ror #2 // ............................*......................... + eor x17, x10, x29 // ............................*......................... + bic x25, x12, x22, ror #47 // .............................*........................ + eor x29, x7, x29 // .............................*........................ + bic x10, x4, x27, ror #2 // ..............................*....................... + bic x7, x5, x28, ror #10 // ..............................*....................... + eor x10, x10, x20, ror #50 // ...............................*...................... + eor x13, x7, x13, ror #57 // ...............................*...................... + bic x7, x2, x5, ror #47 // ................................*..................... + eor x2, x25, x24, ror #39 // ................................*..................... + bic x25, x20, x11, ror #57 // .................................*.................... + bic x5, x17, x4, ror #25 // .................................*.................... + eor x25, x25, x17, ror #53 // ..................................*................... + bic x17, x11, x17, ror #60 // ..................................*................... + eor x28, x7, x28, ror #57 // ...................................*.................. + bic x7, x9, x12, ror #42 // ...................................*.................. 
+ eor x7, x7, x22, ror #25 // ....................................*................. + bic x22, x22, x24, ror #56 // ....................................*................. + bic x24, x24, x15, ror #31 // .....................................*................ + eor x22, x22, x15, ror #23 // .....................................*................ + bic x20, x27, x20, ror #48 // ......................................*............... + bic x15, x15, x9, ror #16 // ......................................*............... + eor x12, x15, x12, ror #58 // .......................................*.............. + eor x15, x5, x27, ror #27 // .......................................*.............. + eor x5, x20, x11, ror #41 // ........................................*............. + ldr x11, [sp, #STACK_OFFSET_CONST_SCALAR] // ........................................*............. + eor x20, x17, x4, ror #21 // .........................................*............ + eor x17, x24, x9, ror #47 // .........................................*............ + mov x24, #1 // ..........................................*........... + bic x9, x0, x16, ror #9 // ..........................................*........... + str x24, [sp, #STACK_OFFSET_COUNT] // ...........................................*.......... // @slothy:writes=STACK_OFFSET_COUNT + bic x24, x29, x1, ror #44 // ...........................................*.......... + bic x27, x1, x21, ror #50 // ............................................*......... + bic x4, x26, x29, ror #63 // ............................................*......... + eor x1, x1, x4, ror #21 // .............................................*........ + ldr x11, [x11] // .............................................*........ + bic x4, x21, x30, ror #57 // ..............................................*....... + eor x21, x24, x21, ror #30 // ..............................................*....... 
+ eor x24, x9, x19, ror #44 // ...............................................*...... + bic x9, x14, x6, ror #5 // ...............................................*...... + eor x9, x9, x0, ror #43 // ................................................*..... + bic x0, x6, x0, ror #38 // ................................................*..... + eor x1, x1, x11 // .................................................*.... + eor x11, x4, x26, ror #35 // .................................................*.... + eor x4, x0, x16, ror #47 // ..................................................*... + bic x0, x16, x19, ror #35 // ..................................................*... + eor x16, x27, x30, ror #43 // ...................................................*.. + bic x27, x30, x26, ror #42 // ...................................................*.. + bic x26, x19, x14, ror #41 // ....................................................*. + eor x19, x0, x14, ror #12 // ....................................................*. 
+ eor x14, x26, x6, ror #46 // .....................................................* + eor x6, x27, x29, ror #41 // .....................................................* + + // eor5 X, sAma, sAsa, sAba, sAga, sAka + // eor5 X, sAme, sAse, sAbe, sAge, sAke + // eor5 X, sAmi, sAsi, sAbi, sAgi, sAki + // eor5 X, sAmo, sAso, sAbo, sAgo, sAko + // eor5 X, sAmu, sAsu, sAbu, sAgu, sAku + + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + + // eor X, sAba, X + // eor X, sAbi, X + // eor X, sAki, X + // eor X, sAko, X + // eor X, sAmu, X + // eor X, sAso, X + // eor X, sAma, X + // eor X, sAbe, X + // eor X, sAgo, X + // eor X, sAme, X + // eor X, sAgi, X + // eor X, sAka, X + // eor X, sAbo, X + // eor X, sAmo, X + // eor X, sAmi, X + // eor X, sAke, X + // eor X, sAgu, X + // eor X, sAsi, X + // eor X, sAku, X + // eor X, sAsa, X + // eor X, sAbu, X + // eor X, sAsu, X + // eor X, sAse, X + // eor X, sAga, X + // eor X, sAge, X + + // ldr X, [sp, #STACK_OFFSET_CONST_SCALAR] + // ldr X, [X] + // mov X, #1 + // str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + // chi_step_ror sAga, X, X, X, 47, 39 + // chi_step_ror sAge, X, X, X, 42, 25 + // chi_step_ror sAgi, X, X, X, 16, 58 + // chi_step_ror sAgo, X, X, X, 31, 47 + // chi_step_ror sAgu, X, X, X, 56, 23 + // chi_step_ror sAka, X, X, X, 19, 24 + // chi_step_ror sAke, X, X, X, 47, 2 + // chi_step_ror sAki, X, X, X, 10, 57 + // chi_step_ror sAko, X, X, X, 47, 57 + // chi_step_ror sAku, X, X, X, 5, 52 + // chi_step_ror sAma, X, X, X, 38, 47 + // chi_step_ror sAme, X, X, X, 5, 43 + // chi_step_ror sAmi, X, X, X, 41, 46 + // chi_step_ror sAmo, X, X, X, 35, 12 + // chi_step_ror sAmu, X, X, X, 9, 44 + // chi_step_ror sAsa, X, X, X, 48, 41 + // chi_step_ror sAse, X, X, X, 2, 50 + // chi_step_ror sAsi, X, X, X, 25, 27 + // chi_step_ror sAso, X, X, X, 60, 21 + // chi_step_ror sAsu, X, X, X, 57, 53 + // chi_step_ror2 sAba, X, X, 
X, 63, 21 + // chi_step_ror sAbe, X, X, X, 42, 41 + // chi_step_ror sAbi, X, X, X, 57, 35 + // chi_step_ror sAbo, X, X, X, 50, 43 + // chi_step_ror sAbu, X, X, X, 44, 30 + + // eor sAba, sAba, X +.endm + +.macro vector_round + eor3_m1 C0, vAba, vAga, vAka + eor3_m1 C0, C0, vAma, vAsa + eor3_m1 C1, vAbe, vAge, vAke + eor3_m1 C1, C1, vAme, vAse + eor3_m1 C2, vAbi, vAgi, vAki + eor3_m1 C2, C2, vAmi, vAsi + eor3_m1 C3, vAbo, vAgo, vAko + eor3_m1 C3, C3, vAmo, vAso + eor3_m1 C4, vAbu, vAgu, vAku + eor3_m1 C4, C4, vAmu, vAsu + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + eor vAba_.16b, vAba.16b, E0.16b + xar_m1 vAsa_, vAbi, E2, 2 + xar_m1 vAbi_, vAki, E2, 21 + xar_m1 vAki_, vAko, E3, 39 + xar_m1 vAko_, vAmu, E4, 56 + xar_m1 vAmu_, vAso, E3, 8 + xar_m1 vAso_, vAma, E0, 23 + xar_m1 vAka_, vAbe, E1, 63 + xar_m1 vAse_, vAgo, E3, 9 + xar_m1 vAgo_, vAme, E1, 19 + xar_m1 vAke_, vAgi, E2, 58 + xar_m1 vAgi_, vAka, E0, 61 + xar_m1 vAga_, vAbo, E3, 36 + xar_m1 vAbo_, vAmo, E3, 43 + xar_m1 vAmo_, vAmi, E2, 49 + xar_m1 vAmi_, vAke, E1, 54 + xar_m1 vAge_, vAgu, E4, 44 + xar_m1 vAgu_, vAsi, E2, 3 + xar_m1 vAsi_, vAku, E4, 25 + xar_m1 vAku_, vAsa, E0, 46 + xar_m1 vAma_, vAbu, E4, 37 + xar_m1 vAbu_, vAsu, E4, 50 + xar_m1 vAsu_, vAse, E1, 62 + xar_m1 vAme_, vAga, E0, 28 + xar_m1 vAbe_, vAge, E1, 20 + ldr tmp, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:reads=STACK_OFFSET_CONST_VECTOR + ld1r {v28.2d}, [tmp], #8 + str tmp, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:writes=STACK_OFFSET_CONST_VECTOR + bcax_m1 vAga, vAga_, vAgi_, vAge_ + bcax_m1 vAge, vAge_, vAgo_, vAgi_ + bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + bcax_m1 vAgu, vAgu_, vAge_, vAga_ + bcax_m1 vAka, vAka_, vAki_, vAke_ + bcax_m1 vAke, vAke_, vAko_, vAki_ + bcax_m1 vAki, vAki_, vAku_, vAko_ + bcax_m1 vAko, vAko_, vAka_, vAku_ + bcax_m1 vAku, vAku_, vAke_, vAka_ + bcax_m1 vAma, vAma_, vAmi_, vAme_ + bcax_m1 vAme, vAme_, vAmo_, vAmi_ + bcax_m1 
vAmi, vAmi_, vAmu_, vAmo_ + bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + bcax_m1 vAmu, vAmu_, vAme_, vAma_ + bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bcax_m1 vAse, vAse_, vAso_, vAsi_ + bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bcax_m1 vAso, vAso_, vAsa_, vAsu_ + bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + eor vAba.16b, vAba.16b, v28.16b +.endm + +.macro scalar_round_noninitial + eor x0, x15, x11, ror #52 // *........................................................ + eor x0, x0, x13, ror #48 // .*....................................................... + eor x26, x8, x9, ror #57 // .*....................................................... + eor x27, x0, x14, ror #10 // ..*...................................................... + eor x29, x16, x28, ror #63 // ..*...................................................... + eor x26, x26, x6, ror #51 // ...*..................................................... + eor x30, x23, x22, ror #50 // ...*..................................................... + eor x0, x26, x10, ror #31 // ....*.................................................... + eor x29, x29, x19, ror #37 // ....*.................................................... + eor x27, x27, x12, ror #5 // .....*................................................... + eor x30, x30, x24, ror #34 // .....*................................................... + eor x0, x0, x7, ror #27 // ......*.................................................. + eor x26, x30, x21, ror #26 // ......*.................................................. + eor x26, x26, x25, ror #15 // .......*................................................. + ror x30, x27, #62 // .......*................................................. + eor x30, x30, x26, ror #57 // ........*................................................ 
+ ror x26, x26, #58 // ........*................................................ + eor x16, x30, x16 // .........*............................................... + eor x28, x30, x28, ror #63 // .........*............................................... + str x28, [sp, #STACK_LOC_0] // ..........*.............................................. // @slothy:writes=stack_0 + eor x29, x29, x17, ror #36 // ..........*.............................................. + eor x28, x1, x2, ror #61 // ...........*............................................. + eor x19, x30, x19, ror #37 // ...........*............................................. + eor x29, x29, x20, ror #2 // ............*............................................ + eor x28, x28, x4, ror #54 // ............*............................................ + eor x26, x26, x0, ror #55 // .............*........................................... + eor x28, x28, x3, ror #39 // .............*........................................... + eor x28, x28, x5, ror #25 // ..............*.......................................... + ror x0, x0, #56 // ..............*.......................................... + eor x0, x0, x29, ror #63 // ...............*......................................... + eor x27, x28, x27, ror #61 // ...............*......................................... + eor x13, x0, x13, ror #46 // ................*........................................ + eor x28, x29, x28, ror #63 // ................*........................................ + eor x29, x30, x20, ror #2 // .................*....................................... + eor x20, x26, x3, ror #39 // .................*....................................... + eor x11, x0, x11, ror #50 // ..................*...................................... + eor x25, x28, x25, ror #9 // ..................*...................................... + eor x3, x28, x21, ror #20 // ...................*..................................... 
+ eor x21, x26, x1 // ...................*..................................... + eor x9, x27, x9, ror #49 // ....................*.................................... + eor x24, x28, x24, ror #28 // ....................*.................................... + eor x1, x30, x17, ror #36 // .....................*................................... + eor x14, x0, x14, ror #8 // .....................*................................... + eor x22, x28, x22, ror #44 // ......................*.................................. + eor x8, x27, x8, ror #56 // ......................*.................................. + eor x17, x27, x7, ror #19 // .......................*................................. + eor x15, x0, x15, ror #62 // .......................*................................. + bic x7, x20, x22, ror #47 // ........................*................................ + eor x4, x26, x4, ror #54 // ........................*................................ + eor x0, x0, x12, ror #3 // .........................*............................... + eor x28, x28, x23, ror #58 // .........................*............................... + eor x23, x26, x2, ror #61 // ..........................*.............................. + eor x26, x26, x5, ror #25 // ..........................*.............................. + eor x2, x7, x16, ror #39 // ...........................*............................. + bic x7, x9, x20, ror #42 // ...........................*............................. + bic x30, x15, x9, ror #16 // ............................*............................ + eor x7, x7, x22, ror #25 // ............................*............................ + eor x12, x30, x20, ror #58 // .............................*........................... + bic x20, x22, x16, ror #56 // .............................*........................... + eor x30, x27, x6, ror #43 // ..............................*.......................... 
+ eor x22, x20, x15, ror #23 // ..............................*.......................... + bic x6, x19, x13, ror #42 // ...............................*......................... + eor x6, x6, x17, ror #41 // ................................*........................ + bic x5, x13, x17, ror #63 // ................................*........................ + eor x5, x21, x5, ror #21 // .................................*....................... + bic x17, x17, x21, ror #44 // .................................*....................... + eor x27, x27, x10, ror #23 // ..................................*...................... + bic x21, x21, x25, ror #50 // ..................................*...................... + bic x20, x27, x4, ror #25 // ...................................*..................... + bic x10, x16, x15, ror #31 // ...................................*..................... + eor x16, x21, x19, ror #43 // ....................................*.................... + eor x21, x17, x25, ror #30 // ....................................*.................... + bic x19, x25, x19, ror #57 // .....................................*................... + ldr x25, [sp, #STACK_OFFSET_COUNT] // .....................................*................... // @slothy:reads=STACK_OFFSET_COUNT + eor x17, x10, x9, ror #47 // ......................................*.................. + ldr x9, [sp, #STACK_OFFSET_CONST_SCALAR] // ......................................*.................. + eor x15, x20, x28, ror #27 // .......................................*................. + bic x20, x4, x28, ror #2 // .......................................*................. + eor x10, x20, x1, ror #50 // ........................................*................ + bic x20, x11, x27, ror #60 // ........................................*................ + eor x20, x20, x4, ror #21 // .........................................*............... 
+ bic x4, x28, x1, ror #48 // .........................................*............... + bic x1, x1, x11, ror #57 // ..........................................*.............. + ldr x28, [x9, w25, UXTW #3] // ..........................................*.............. + ldr x9, [sp, #STACK_LOC_0] // ...........................................*............. // @slothy:reads=stack_0 + add x25, x25, #1 // ...........................................*............. + str x25, [sp, #STACK_OFFSET_COUNT] // ............................................*............ // @slothy:writes=STACK_OFFSET_COUNT + cmp x25, #(KECCAK_F1600_ROUNDS-1) // ............................................*............ // @slothy:ignore_useless_output + eor x25, x1, x27, ror #53 // .............................................*........... + bic x27, x30, x26, ror #47 // .............................................*........... + eor x1, x5, x28 // ..............................................*.......... + eor x5, x4, x11, ror #41 // ..............................................*.......... + eor x11, x19, x13, ror #35 // ...............................................*......... + bic x13, x26, x24, ror #10 // ...............................................*......... + eor x28, x27, x24, ror #57 // ................................................*........ + bic x27, x24, x9, ror #47 // ................................................*........ + bic x19, x23, x3, ror #9 // .................................................*....... + bic x4, x29, x14, ror #41 // .................................................*....... + eor x24, x19, x29, ror #44 // ..................................................*...... + bic x29, x3, x29, ror #35 // ..................................................*...... + eor x13, x13, x9, ror #57 // ...................................................*..... + eor x19, x29, x14, ror #12 // ...................................................*..... 
+ bic x29, x9, x0, ror #19 // ....................................................*.... + bic x14, x14, x8, ror #5 // ....................................................*.... + eor x9, x14, x23, ror #43 // .....................................................*... + eor x14, x4, x8, ror #46 // .....................................................*... + bic x23, x8, x23, ror #38 // ......................................................*.. + eor x8, x27, x0, ror #2 // ......................................................*.. + eor x4, x23, x3, ror #47 // .......................................................*. + bic x3, x0, x30, ror #5 // .......................................................*. + eor x23, x3, x26, ror #52 // ........................................................* + eor x3, x29, x30, ror #24 // ........................................................* + + // eor X, sAba, sAga, ror #61 + // eor X, X, sAma, ror #54 + // eor X, X, sAka, ror #39 + // eor X, X, sAsa, ror #25 + + // eor X, sAke, sAme, ror #57 + // eor X, X, sAbe, ror #51 + // eor X, X, sAse, ror #31 + // eor X, X, sAge, ror #27 + + // eor X, sAsi, sAbi, ror #52 + // eor X, X, sAki, ror #48 + // eor X, X, sAmi, ror #10 + // eor X, X, sAgi, ror #5 + + // eor X, sAbo, sAko, ror #63 + // eor X, X, sAmo, ror #37 + // eor X, X, sAgo, ror #36 + // eor X, X, sAso, ror #2 + + // eor X, sAku, sAgu, ror #50 + // eor X, X, sAmu, ror #34 + // eor X, X, sAbu, ror #26 + // eor X, X, sAsu, ror #15 + + // eor X, X, X, ror #61 + // ror X, X, #62 + // eor X, X, X, ror #57 + // ror X, X, #58 + // eor X, X, X, ror #55 + // ror X, X, #56 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + + // eor X, X, sAba + // eor X, X, sAbi, ror #50 + // eor X, X, sAki, ror #46 + // eor X, X, sAko, ror #63 + // eor X, X, sAmu, ror #28 + // eor X, X, sAso, ror #2 + // eor X, X, sAma, ror #54 + // eor X, X, sAbe, ror #43 + // eor X, X, sAgo, ror #36 + // eor X, X, sAme, ror #49 + // eor X, X, sAgi, ror #3 + // eor X, X, 
sAka, ror #39 + // eor X, X, sAbo + // eor X, X, sAmo, ror #37 + // eor X, X, sAmi, ror #8 + // eor X, X, sAke, ror #56 + // eor X, X, sAgu, ror #44 + // eor X, X, sAsi, ror #62 + // eor X, X, sAku, ror #58 + // eor X, X, sAsa, ror #25 + // eor X, X, sAbu, ror #20 + // eor X, X, sAsu, ror #9 + // eor X, X, sAse, ror #23 + // eor X, X, sAga, ror #61 + // eor X, X, sAge, ror #19 + + // ldr X, [sp, #STACK_OFFSET_CONST_SCALAR] + // ldr X, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT + // ldr X, [X, W, UXTW #3] + // add X, X, #1 + // cmp X, #(KECCAK_F1600_ROUNDS-1) // @slothy:ignore_useless_output + // str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + // chi_step_ror sAga, X, X, X, 47, 39 + // chi_step_ror sAge, X, X, X, 42, 25 + // chi_step_ror sAgi, X, X, X, 16, 58 + // chi_step_ror sAgo, X, X, X, 31, 47 + // chi_step_ror sAgu, X, X, X, 56, 23 + // chi_step_ror sAka, X, X, X, 19, 24 + // chi_step_ror sAke, X, X, X, 47, 2 + // chi_step_ror sAki, X, X, X, 10, 57 + // chi_step_ror sAko, X, X, X, 47, 57 + // chi_step_ror sAku, X, X, X, 5, 52 + // chi_step_ror sAma, X, X, X, 38, 47 + // chi_step_ror sAme, X, X, X, 5, 43 + // chi_step_ror sAmi, X, X, X, 41, 46 + // chi_step_ror sAmo, X, X, X, 35, 12 + // chi_step_ror sAmu, X, X, X, 9, 44 + // chi_step_ror sAsa, X, X, X, 48, 41 + // chi_step_ror sAse, X, X, X, 2, 50 + // chi_step_ror sAsi, X, X, X, 25, 27 + // chi_step_ror sAso, X, X, X, 60, 21 + // chi_step_ror sAsu, X, X, X, 57, 53 + // chi_step_ror2 sAba, X, X, X, 63, 21 + // chi_step_ror sAbe, X, X, X, 42, 41 + // chi_step_ror sAbi, X, X, X, 57, 35 + // chi_step_ror sAbo, X, X, X, 50, 43 + // chi_step_ror sAbu, X, X, X, 44, 30 + + // eor sAba, sAba, X +.endm + +.macro final_scalar_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, 
sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +.global keccak_f1600_x4_hybrid_slothy_clean +.global _keccak_f1600_x4_hybrid_slothy_clean +.text +.align 4 + +keccak_f1600_x4_hybrid_slothy_clean: +_keccak_f1600_x4_hybrid_slothy_clean: + alloc_stack + save_gprs + save_vregs + + ASM_LOAD(const_addr, round_constants) + + mov outer, #0 + str outer, [sp, #STACK_OFFSET_OUTER] // @slothy:writes=STACK_OFFSET_OUTER + str const_addr, [sp, #STACK_OFFSET_CONST_SCALAR] // @slothy:writes=STACK_OFFSET_CONST_SCALAR + str const_addr, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:writes=STACK_OFFSET_CONST_VECTOR + str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT + + load_input_vector 2,1 // Vector input + load_input_scalar 4,0 // First scalar input + + initial: + scalar_round_initial // @slothy:interleaving_class=0 + scalar_round_noninitial // @slothy:interleaving_class=0 + vector_round // @slothy:interleaving_class=1 + loop: + scalar_round_noninitial // @slothy:interleaving_class=0 + scalar_round_noninitial // @slothy:interleaving_class=0 + vector_round // @slothy:interleaving_class=1 + loop_end: + ble loop + final_scalar_rotate + + // Read outer loop flag: We repeat the above twice + ldr outer, [sp, #STACK_OFFSET_OUTER] // @slothy:reads=STACK_OFFSET_OUTER + cmp outer, #1 + beq done + + // Update outer loop flag + mov outer, #1 + str outer, [sp, #STACK_OFFSET_OUTER] // @slothy:writes=STACK_OFFSET_OUTER + + ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT + store_input_scalar 4,0 // Store first scalar data + load_input_scalar 4,1 // Load second scalar input + + b initial +done: + + ldr input_addr, [sp, #STACK_OFFSET_INPUT] 
// @slothy:reads=STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_interleaved.s b/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_interleaved.s new file mode 100644 index 00000000..79bc57be --- /dev/null +++ b/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_interleaved.s @@ -0,0 +1,1741 @@ +/* + * Copyright (c) 2021-2022 Arm Limited + * Copyright (c) 2022 Matthias Kannwischer + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + */ +#include + +#define KECCAK_F1600_ROUNDS 24 + + +// Author: Hanno Becker +// Author: Matthias Kannwischer + + +/********************** CONSTANTS *************************/ + .data + .align(8) +round_constants: /* One 64-bit constant per round; the table below has 24 entries, matching KECCAK_F1600_ROUNDS. */ + .quad 0x0000000000000001 + .quad 0x0000000000008082 + .quad 0x800000000000808a + .quad 0x8000000080008000 + .quad 0x000000000000808b + .quad 0x0000000080000001 + .quad 0x8000000080008081 + .quad 0x8000000000008009 + .quad 0x000000000000008a + .quad 0x0000000000000088 + .quad 0x0000000080008009 + .quad 0x000000008000000a + .quad 0x000000008000808b + .quad 0x800000000000008b + .quad 0x8000000000008089 + .quad 0x8000000000008003 + .quad 0x8000000000008002 + .quad 0x8000000000000080 + .quad 0x000000000000800a + .quad 0x800000008000000a + .quad 0x8000000080008081 + .quad 0x8000000000008080 + .quad 0x0000000080000001 + .quad 0x8000000080008008 + +/****************** REGISTER ALLOCATIONS *******************/ + + input_addr .req x0 + const_addr .req x29 + outer .req x30 + cur_const .req x26 + + /* Mapping of Keccak-f1600 SIMD state to vector registers + * at the beginning and end of each round.
*/ + + vAba .req v0 + vAbe .req v1 + vAbi .req v2 + vAbo .req v3 + vAbu .req v4 + vAga .req v5 + vAge .req v6 + vAgi .req v7 + vAgo .req v8 + vAgu .req v9 + vAka .req v10 + vAke .req v11 + vAki .req v12 + vAko .req v13 + vAku .req v14 + vAma .req v15 + vAme .req v16 + vAmi .req v17 + vAmo .req v18 + vAmu .req v19 + vAsa .req v20 + vAse .req v21 + vAsi .req v22 + vAso .req v23 + vAsu .req v24 + + /* q-form of the above mapping */ + vAbaq .req q0 + vAbeq .req q1 + vAbiq .req q2 + vAboq .req q3 + vAbuq .req q4 + vAgaq .req q5 + vAgeq .req q6 + vAgiq .req q7 + vAgoq .req q8 + vAguq .req q9 + vAkaq .req q10 + vAkeq .req q11 + vAkiq .req q12 + vAkoq .req q13 + vAkuq .req q14 + vAmaq .req q15 + vAmeq .req q16 + vAmiq .req q17 + vAmoq .req q18 + vAmuq .req q19 + vAsaq .req q20 + vAseq .req q21 + vAsiq .req q22 + vAsoq .req q23 + vAsuq .req q24 + + /* C[x] = A[x,0] xor A[x,1] xor A[x,2] xor A[x,3] xor A[x,4], for x in 0..4 */ + C0 .req v30 + C1 .req v29 + C2 .req v28 + C3 .req v27 + C4 .req v26 + + /* E[x] = C[x-1] xor rot(C[x+1],1), for x in 0..4 */ + E0 .req v26 + E1 .req v25 + E2 .req v29 + E3 .req v28 + E4 .req v27 + + /* A_[y,2*x+3*y] = rot(A[x,y]) */ + vAbi_ .req v2 + vAbo_ .req v3 + vAbu_ .req v4 + vAga_ .req v10 + vAge_ .req v11 + vAgi_ .req v7 + vAgo_ .req v8 + vAgu_ .req v9 + vAka_ .req v15 + vAke_ .req v16 + vAki_ .req v12 + vAko_ .req v13 + vAku_ .req v14 + vAma_ .req v20 + vAme_ .req v21 + vAmi_ .req v17 + vAmo_ .req v18 + vAmu_ .req v19 + vAsa_ .req v0 + vAse_ .req v1 + vAsi_ .req v22 + vAso_ .req v23 + vAsu_ .req v24 + vAba_ .req v30 + vAbe_ .req v27 + + /* Scratch register outside the state/C/E mappings above; clobbered by the rax1_m1, xar_m1 and bcax_m1 macros */ + vtmp .req v31 + + /* Mapping of Keccak-f1600 state to scalar registers + * at the beginning and end of each round.
*/ + sAba .req x1 + sAbe .req x6 + sAbi .req x11 + sAbo .req x16 + sAbu .req x21 + sAga .req x2 + sAge .req x7 + sAgi .req x12 + sAgo .req x17 + sAgu .req x22 + sAka .req x3 + sAke .req x8 + sAki .req x13 + sAko .req x28 + sAku .req x23 + sAma .req x4 + sAme .req x9 + sAmi .req x14 + sAmo .req x19 + sAmu .req x24 + sAsa .req x5 + sAse .req x10 + sAsi .req x15 + sAso .req x20 + sAsu .req x25 + + tmp .req x30 + +/************************ MACROS ****************************/ +/* eor3_m1: d = s0 ^ s1 ^ s2 (three-way XOR built from two plain EORs) */ +.macro eor3_m1 d s0 s1 s2 + eor \d\().16b, \s0\().16b, \s1\().16b + eor \d\().16b, \d\().16b, \s2\().16b +.endm +/* rax1_m1: d = s0 ^ rol64(s1, 1) per 64-bit lane; add doubles s1 (shift left by 1), sri inserts its top bit at bit 0. Clobbers vtmp. */ +.macro rax1_m1 d s0 s1 + add vtmp.2d, \s1\().2d, \s1\().2d + sri vtmp.2d, \s1\().2d, #63 + eor \d\().16b, vtmp.16b, \s0\().16b +.endm +/* xar_m1: d = ror64(s0 ^ s1, imm) per 64-bit lane, composed as shl by (64-imm) followed by sri by imm. Clobbers vtmp. */ +.macro xar_m1 d s0 s1 imm + eor vtmp.16b, \s0\().16b, \s1\().16b + shl \d\().2d, vtmp.2d, #(64-\imm) + sri \d\().2d, vtmp.2d, #(\imm) +.endm +/* bcax_m1: d = s0 ^ (s1 & ~s2). Clobbers vtmp. */ +.macro bcax_m1 d s0 s1 s2 + bic vtmp.16b, \s1\().16b, \s2\().16b + eor \d\().16b, vtmp.16b, \s0\().16b +.endm + +.macro load_input_vector num idx + ldr vAbaq, [input_addr, #(16*(\num*0+\idx))] + ldr vAbeq, [input_addr, #(16*(\num*1+\idx))] + ldr vAbiq, [input_addr, #(16*(\num*2+\idx))] + ldr vAboq, [input_addr, #(16*(\num*3+\idx))] + ldr vAbuq, [input_addr, #(16*(\num*4+\idx))] + ldr vAgaq, [input_addr, #(16*(\num*5+\idx))] + ldr vAgeq, [input_addr, #(16*(\num*6+\idx))] + ldr vAgiq, [input_addr, #(16*(\num*7+\idx))] + ldr vAgoq, [input_addr, #(16*(\num*8+\idx))] + ldr vAguq, [input_addr, #(16*(\num*9+\idx))] + ldr vAkaq, [input_addr, #(16*(\num*10+\idx))] + ldr vAkeq, [input_addr, #(16*(\num*11+\idx))] + ldr vAkiq, [input_addr, #(16*(\num*12+\idx))] + ldr vAkoq, [input_addr, #(16*(\num*13+\idx))] + ldr vAkuq, [input_addr, #(16*(\num*14+\idx))] + ldr vAmaq, [input_addr, #(16*(\num*15+\idx))] + ldr vAmeq, [input_addr, #(16*(\num*16+\idx))] + ldr vAmiq, [input_addr, #(16*(\num*17+\idx))] + ldr vAmoq, [input_addr, #(16*(\num*18+\idx))] + ldr vAmuq, [input_addr, #(16*(\num*19+\idx))] + ldr vAsaq, [input_addr,
#(16*(\num*20+\idx))] + ldr vAseq, [input_addr, #(16*(\num*21+\idx))] + ldr vAsiq, [input_addr, #(16*(\num*22+\idx))] + ldr vAsoq, [input_addr, #(16*(\num*23+\idx))] + ldr vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + /* store_input_vector: write the 25 q-form state registers to input_addr; the i-th register (vAba = 0 ... vAsu = 24) is stored at byte offset 16*(num*i + idx). */ +.macro store_input_vector num idx + str vAbaq, [input_addr, #(16*(\num*0+\idx))] + str vAbeq, [input_addr, #(16*(\num*1+\idx))] + str vAbiq, [input_addr, #(16*(\num*2+\idx))] + str vAboq, [input_addr, #(16*(\num*3+\idx))] + str vAbuq, [input_addr, #(16*(\num*4+\idx))] + str vAgaq, [input_addr, #(16*(\num*5+\idx))] + str vAgeq, [input_addr, #(16*(\num*6+\idx))] + str vAgiq, [input_addr, #(16*(\num*7+\idx))] + str vAgoq, [input_addr, #(16*(\num*8+\idx))] + str vAguq, [input_addr, #(16*(\num*9+\idx))] + str vAkaq, [input_addr, #(16*(\num*10+\idx))] + str vAkeq, [input_addr, #(16*(\num*11+\idx))] + str vAkiq, [input_addr, #(16*(\num*12+\idx))] + str vAkoq, [input_addr, #(16*(\num*13+\idx))] + str vAkuq, [input_addr, #(16*(\num*14+\idx))] + str vAmaq, [input_addr, #(16*(\num*15+\idx))] + str vAmeq, [input_addr, #(16*(\num*16+\idx))] + str vAmiq, [input_addr, #(16*(\num*17+\idx))] + str vAmoq, [input_addr, #(16*(\num*18+\idx))] + str vAmuq, [input_addr, #(16*(\num*19+\idx))] + str vAsaq, [input_addr, #(16*(\num*20+\idx))] + str vAseq, [input_addr, #(16*(\num*21+\idx))] + str vAsiq, [input_addr, #(16*(\num*22+\idx))] + str vAsoq, [input_addr, #(16*(\num*23+\idx))] + str vAsuq, [input_addr, #(16*(\num*24+\idx))] +.endm + /* store_input_scalar: write the scalar state registers to input_addr; the i-th register (sAba = 0, sAbe = 1, ...) is stored at byte offset 8*(num*i + idx). */ +.macro store_input_scalar num idx + str sAba, [input_addr, 8*(\num*(0) +\idx)] + str sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + str sAbi, [input_addr, 8*(\num*(2)+ \idx)] + str sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + str sAbu, [input_addr, 8*(\num*(4)+ \idx)] + str sAga, [input_addr, 8*(\num*(4+1) +\idx)] + str sAge, [input_addr, 8*(\num*(6)+ \idx)] + str sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + str sAgo, [input_addr, 8*(\num*(8)+ \idx)] + str sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + str sAka, [input_addr, 8*(\num*(10) +\idx)] + 
str sAke, [input_addr, 8*(\num*(10+1)+\idx)] + str sAki, [input_addr, 8*(\num*(12) +\idx)] + str sAko, [input_addr, 8*(\num*(12+1)+\idx)] + str sAku, [input_addr, 8*(\num*(14) +\idx)] + str sAma, [input_addr, 8*(\num*(14+1)+\idx)] + str sAme, [input_addr, 8*(\num*(16) +\idx)] + str sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + str sAmo, [input_addr, 8*(\num*(18) +\idx)] + str sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + str sAsa, [input_addr, 8*(\num*(20) +\idx)] + str sAse, [input_addr, 8*(\num*(20+1)+\idx)] + str sAsi, [input_addr, 8*(\num*(22) +\idx)] + str sAso, [input_addr, 8*(\num*(22+1)+\idx)] + str sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + /* load_input_scalar: load the 25 scalar state registers from input_addr; the i-th register (sAba = 0 ... sAsu = 24) is read at byte offset 8*(num*i + idx). */ +.macro load_input_scalar num idx + ldr sAba, [input_addr, 8*(\num*(0) +\idx)] + ldr sAbe, [input_addr, 8*(\num*(0+1) +\idx)] + ldr sAbi, [input_addr, 8*(\num*(2)+ \idx)] + ldr sAbo, [input_addr, 8*(\num*(2+1) +\idx)] + ldr sAbu, [input_addr, 8*(\num*(4)+ \idx)] + ldr sAga, [input_addr, 8*(\num*(4+1) +\idx)] + ldr sAge, [input_addr, 8*(\num*(6)+ \idx)] + ldr sAgi, [input_addr, 8*(\num*(6+1) +\idx)] + ldr sAgo, [input_addr, 8*(\num*(8)+ \idx)] + ldr sAgu, [input_addr, 8*(\num*(8+1) +\idx)] + ldr sAka, [input_addr, 8*(\num*(10) +\idx)] + ldr sAke, [input_addr, 8*(\num*(10+1)+\idx)] + ldr sAki, [input_addr, 8*(\num*(12) +\idx)] + ldr sAko, [input_addr, 8*(\num*(12+1)+\idx)] + ldr sAku, [input_addr, 8*(\num*(14) +\idx)] + ldr sAma, [input_addr, 8*(\num*(14+1)+\idx)] + ldr sAme, [input_addr, 8*(\num*(16) +\idx)] + ldr sAmi, [input_addr, 8*(\num*(16+1)+\idx)] + ldr sAmo, [input_addr, 8*(\num*(18) +\idx)] + ldr sAmu, [input_addr, 8*(\num*(18+1)+\idx)] + ldr sAsa, [input_addr, 8*(\num*(20) +\idx)] + ldr sAse, [input_addr, 8*(\num*(20+1)+\idx)] + ldr sAsi, [input_addr, 8*(\num*(22) +\idx)] + ldr sAso, [input_addr, 8*(\num*(22+1)+\idx)] + ldr sAsu, [input_addr, 8*(\num*(24) +\idx)] +.endm + /* Number of 8-byte scratch slots (STACK_LOC_n) reserved at the top of the frame. */ +#define STACK_LOCS 6 + /* Frame layout from sp upwards: 6*8 bytes for the STACK_OFFSET_* slots, 16*6 bytes for x19-x30 (save_gprs), 8*8 bytes for d8-d15 (save_vregs), then STACK_LOCS scratch slots of 8 bytes each. */ +#define STACK_SIZE (16*6 + 8*8 + 6*8 + (STACK_LOCS) * 8) +#define STACK_BASE_GPRS (6*8) +#define STACK_BASE_VREGS 
(6*8 + 16*6) +#define STACK_OFFSET_LOCS (16*6 + 8*8 + 6*8) + /* Fixed 8-byte slots at the bottom of the frame. */ +#define STACK_OFFSET_INPUT (0*8) +#define STACK_OFFSET_CONST_SCALAR (1*8) +#define STACK_OFFSET_CONST_VECTOR (2*8) +#define STACK_OFFSET_COUNT (3*8) +#define STACK_OFFSET_OUTER (4*8) + /* Scratch slots: STACK_LOC_n is the n-th 8-byte slot above STACK_OFFSET_LOCS, for n in 0..STACK_LOCS-1. Every slot has its own offset; STACK_LOC_5 previously repeated the 4*8 offset of STACK_LOC_4 and so aliased it. */ +#define STACK_LOC_0 ((STACK_OFFSET_LOCS) + 0*8) +#define STACK_LOC_1 ((STACK_OFFSET_LOCS) + 1*8) +#define STACK_LOC_2 ((STACK_OFFSET_LOCS) + 2*8) +#define STACK_LOC_3 ((STACK_OFFSET_LOCS) + 3*8) +#define STACK_LOC_4 ((STACK_OFFSET_LOCS) + 4*8) +#define STACK_LOC_5 ((STACK_OFFSET_LOCS) + 5*8) + /* save_gprs: store x19-x30 pairwise at sp + STACK_BASE_GPRS (callee-saved registers plus frame pointer and link register under AAPCS64). */ +.macro save_gprs + stp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + stp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + stp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + stp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + stp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + stp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + /* restore_gprs: reload x19-x30 from the slots written by save_gprs. */ +.macro restore_gprs + ldp x19, x20, [sp, #(STACK_BASE_GPRS + 16*0)] + ldp x21, x22, [sp, #(STACK_BASE_GPRS + 16*1)] + ldp x23, x24, [sp, #(STACK_BASE_GPRS + 16*2)] + ldp x25, x26, [sp, #(STACK_BASE_GPRS + 16*3)] + ldp x27, x28, [sp, #(STACK_BASE_GPRS + 16*4)] + ldp x29, x30, [sp, #(STACK_BASE_GPRS + 16*5)] +.endm + /* save_vregs: store d8-d15 (the low 64 bits of v8-v15) pairwise at sp + STACK_BASE_VREGS. */ +.macro save_vregs + stp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] + stp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + stp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + stp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] +.endm + /* restore_vregs: reload d8-d15 from the slots written by save_vregs. */ +.macro restore_vregs + ldp d14, d15, [sp,#(STACK_BASE_VREGS+3*16)] + ldp d12, d13, [sp,#(STACK_BASE_VREGS+2*16)] + ldp d10, d11, [sp,#(STACK_BASE_VREGS+1*16)] + ldp d8, d9, [sp,#(STACK_BASE_VREGS+0*16)] +.endm + /* alloc_stack: reserve STACK_SIZE bytes by lowering sp. */ +.macro alloc_stack + sub sp, sp, #(STACK_SIZE) +.endm + /* free_stack: release the STACK_SIZE bytes reserved by alloc_stack. */ +.macro free_stack + add sp, sp, #(STACK_SIZE) +.endm + /* eor5: dst = src0 ^ src1 ^ src2 ^ src3 ^ src4. dst is written before src2..src4 are read, so it must not alias them. */ +.macro eor5 dst, src0, src1, src2, src3, src4 + eor \dst, \src0, \src1 + eor \dst, \dst, \src2 + eor \dst, \dst, \src3 + eor \dst, \dst, \src4 +.endm + /* chi_step_ror: out = (b & ~ror(c, r1)) ^ ror(a, r2). X is not defined in this part of the file; presumably a placeholder register name resolved elsewhere (TODO confirm). */ +.macro chi_step_ror out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), X, \a\(), ror #\r2 +.endm + +.macro 
chi_step_ror2 out, a, b, c, r1, r2 + bic X, \b\(), \c\(), ror #\r1 + eor \out\(), \a\(), X, ror #\r2 +.endm + +.macro scalar_round_initial + eor x30, x24, x25 // *..................................................... + eor x27, x9, x10 // *..................................................... + eor x0, x30, x21 // .*.................................................... + eor x26, x27, x6 // .*.................................................... + eor x27, x26, x7 // ..*................................................... + eor x29, x0, x22 // ..*................................................... + eor x26, x29, x23 // ...*.................................................. + eor x29, x4, x5 // ...*.................................................. + eor x30, x29, x1 // ....*................................................. + eor x0, x27, x8 // ....*................................................. + eor x29, x30, x2 // .....*................................................ + eor x30, x19, x20 // .....*................................................ + eor x30, x30, x16 // ......*............................................... + eor x27, x26, x0, ror #63 // ......*............................................... + eor x4, x4, x27 // .......*.............................................. + eor x30, x30, x17 // .......*.............................................. + eor x30, x30, x28 // ........*............................................. + eor x29, x29, x3 // ........*............................................. + eor x0, x0, x30, ror #63 // .........*............................................ + eor x30, x30, x29, ror #63 // .........*............................................ + eor x22, x22, x30 // ..........*........................................... + eor x23, x23, x30 // ..........*........................................... + str x23, [sp, #STACK_LOC_0] // ...........*.......................................... 
// @slothy:writes=stack_0 + eor x23, x14, x15 // ...........*.......................................... + eor x14, x14, x0 // ............*......................................... + eor x23, x23, x11 // ............*......................................... + eor x15, x15, x0 // .............*........................................ + eor x1, x1, x27 // .............*........................................ + eor x23, x23, x12 // ..............*....................................... + eor x23, x23, x13 // ...............*...................................... + eor x11, x11, x0 // ...............*...................................... + eor x29, x29, x23, ror #63 // ................*..................................... + eor x23, x23, x26, ror #63 // ................*..................................... + eor x26, x13, x0 // .................*.................................... + eor x13, x28, x23 // .................*.................................... + eor x28, x24, x30 // ..................*................................... + eor x24, x16, x23 // ..................*................................... + eor x16, x21, x30 // ...................*.................................. + eor x21, x25, x30 // ...................*.................................. + eor x30, x19, x23 // ....................*................................. + eor x19, x20, x23 // ....................*................................. + eor x20, x17, x23 // .....................*................................ + eor x17, x12, x0 // .....................*................................ + eor x0, x2, x27 // ......................*............................... + eor x2, x6, x29 // ......................*............................... + eor x6, x8, x29 // .......................*.............................. + bic x8, x28, x13, ror #47 // .......................*.............................. + eor x12, x3, x27 // ........................*............................. 
+ bic x3, x13, x17, ror #19 // ........................*............................. + eor x5, x5, x27 // .........................*............................ + ldr x27, [sp, #STACK_LOC_0] // .........................*............................ // @slothy:reads=stack_0 + bic x25, x17, x2, ror #5 // ..........................*........................... + eor x9, x9, x29 // ..........................*........................... + eor x23, x25, x5, ror #52 // ...........................*.......................... + eor x3, x3, x2, ror #24 // ...........................*.......................... + eor x8, x8, x17, ror #2 // ............................*......................... + eor x17, x10, x29 // ............................*......................... + bic x25, x12, x22, ror #47 // .............................*........................ + eor x29, x7, x29 // .............................*........................ + bic x10, x4, x27, ror #2 // ..............................*....................... + bic x7, x5, x28, ror #10 // ..............................*....................... + eor x10, x10, x20, ror #50 // ...............................*...................... + eor x13, x7, x13, ror #57 // ...............................*...................... + bic x7, x2, x5, ror #47 // ................................*..................... + eor x2, x25, x24, ror #39 // ................................*..................... + bic x25, x20, x11, ror #57 // .................................*.................... + bic x5, x17, x4, ror #25 // .................................*.................... + eor x25, x25, x17, ror #53 // ..................................*................... + bic x17, x11, x17, ror #60 // ..................................*................... + eor x28, x7, x28, ror #57 // ...................................*.................. + bic x7, x9, x12, ror #42 // ...................................*.................. 
+ eor x7, x7, x22, ror #25 // ....................................*................. + bic x22, x22, x24, ror #56 // ....................................*................. + bic x24, x24, x15, ror #31 // .....................................*................ + eor x22, x22, x15, ror #23 // .....................................*................ + bic x20, x27, x20, ror #48 // ......................................*............... + bic x15, x15, x9, ror #16 // ......................................*............... + eor x12, x15, x12, ror #58 // .......................................*.............. + eor x15, x5, x27, ror #27 // .......................................*.............. + eor x5, x20, x11, ror #41 // ........................................*............. + ldr x11, [sp, #STACK_OFFSET_CONST_SCALAR] // ........................................*............. + eor x20, x17, x4, ror #21 // .........................................*............ + eor x17, x24, x9, ror #47 // .........................................*............ + mov x24, #1 // ..........................................*........... + bic x9, x0, x16, ror #9 // ..........................................*........... + str x24, [sp, #STACK_OFFSET_COUNT] // ...........................................*.......... // @slothy:writes=STACK_OFFSET_COUNT + bic x24, x29, x1, ror #44 // ...........................................*.......... + bic x27, x1, x21, ror #50 // ............................................*......... + bic x4, x26, x29, ror #63 // ............................................*......... + eor x1, x1, x4, ror #21 // .............................................*........ + ldr x11, [x11] // .............................................*........ + bic x4, x21, x30, ror #57 // ..............................................*....... + eor x21, x24, x21, ror #30 // ..............................................*....... 
+ eor x24, x9, x19, ror #44 // ...............................................*...... + bic x9, x14, x6, ror #5 // ...............................................*...... + eor x9, x9, x0, ror #43 // ................................................*..... + bic x0, x6, x0, ror #38 // ................................................*..... + eor x1, x1, x11 // .................................................*.... + eor x11, x4, x26, ror #35 // .................................................*.... + eor x4, x0, x16, ror #47 // ..................................................*... + bic x0, x16, x19, ror #35 // ..................................................*... + eor x16, x27, x30, ror #43 // ...................................................*.. + bic x27, x30, x26, ror #42 // ...................................................*.. + bic x26, x19, x14, ror #41 // ....................................................*. + eor x19, x0, x14, ror #12 // ....................................................*. 
+ eor x14, x26, x6, ror #46 // .....................................................* + eor x6, x27, x29, ror #41 // .....................................................* + + // eor5 X, sAma, sAsa, sAba, sAga, sAka + // eor5 X, sAme, sAse, sAbe, sAge, sAke + // eor5 X, sAmi, sAsi, sAbi, sAgi, sAki + // eor5 X, sAmo, sAso, sAbo, sAgo, sAko + // eor5 X, sAmu, sAsu, sAbu, sAgu, sAku + + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + + // eor X, sAba, X + // eor X, sAbi, X + // eor X, sAki, X + // eor X, sAko, X + // eor X, sAmu, X + // eor X, sAso, X + // eor X, sAma, X + // eor X, sAbe, X + // eor X, sAgo, X + // eor X, sAme, X + // eor X, sAgi, X + // eor X, sAka, X + // eor X, sAbo, X + // eor X, sAmo, X + // eor X, sAmi, X + // eor X, sAke, X + // eor X, sAgu, X + // eor X, sAsi, X + // eor X, sAku, X + // eor X, sAsa, X + // eor X, sAbu, X + // eor X, sAsu, X + // eor X, sAse, X + // eor X, sAga, X + // eor X, sAge, X + + // ldr X, [sp, #STACK_OFFSET_CONST_SCALAR] + // ldr X, [X] + // mov X, #1 + // str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + // chi_step_ror sAga, X, X, X, 47, 39 + // chi_step_ror sAge, X, X, X, 42, 25 + // chi_step_ror sAgi, X, X, X, 16, 58 + // chi_step_ror sAgo, X, X, X, 31, 47 + // chi_step_ror sAgu, X, X, X, 56, 23 + // chi_step_ror sAka, X, X, X, 19, 24 + // chi_step_ror sAke, X, X, X, 47, 2 + // chi_step_ror sAki, X, X, X, 10, 57 + // chi_step_ror sAko, X, X, X, 47, 57 + // chi_step_ror sAku, X, X, X, 5, 52 + // chi_step_ror sAma, X, X, X, 38, 47 + // chi_step_ror sAme, X, X, X, 5, 43 + // chi_step_ror sAmi, X, X, X, 41, 46 + // chi_step_ror sAmo, X, X, X, 35, 12 + // chi_step_ror sAmu, X, X, X, 9, 44 + // chi_step_ror sAsa, X, X, X, 48, 41 + // chi_step_ror sAse, X, X, X, 2, 50 + // chi_step_ror sAsi, X, X, X, 25, 27 + // chi_step_ror sAso, X, X, X, 60, 21 + // chi_step_ror sAsu, X, X, X, 57, 53 + // chi_step_ror2 sAba, X, X, 
X, 63, 21 + // chi_step_ror sAbe, X, X, X, 42, 41 + // chi_step_ror sAbi, X, X, X, 57, 35 + // chi_step_ror sAbo, X, X, X, 50, 43 + // chi_step_ror sAbu, X, X, X, 44, 30 + + // eor sAba, sAba, X +.endm + +.macro vector_round + eor3_m1 C0, vAba, vAga, vAka + eor3_m1 C0, C0, vAma, vAsa + eor3_m1 C1, vAbe, vAge, vAke + eor3_m1 C1, C1, vAme, vAse + eor3_m1 C2, vAbi, vAgi, vAki + eor3_m1 C2, C2, vAmi, vAsi + eor3_m1 C3, vAbo, vAgo, vAko + eor3_m1 C3, C3, vAmo, vAso + eor3_m1 C4, vAbu, vAgu, vAku + eor3_m1 C4, C4, vAmu, vAsu + rax1_m1 E1, C0, C2 + rax1_m1 E3, C2, C4 + rax1_m1 E0, C4, C1 + rax1_m1 E2, C1, C3 + rax1_m1 E4, C3, C0 + eor vAba_.16b, vAba.16b, E0.16b + xar_m1 vAsa_, vAbi, E2, 2 + xar_m1 vAbi_, vAki, E2, 21 + xar_m1 vAki_, vAko, E3, 39 + xar_m1 vAko_, vAmu, E4, 56 + xar_m1 vAmu_, vAso, E3, 8 + xar_m1 vAso_, vAma, E0, 23 + xar_m1 vAka_, vAbe, E1, 63 + xar_m1 vAse_, vAgo, E3, 9 + xar_m1 vAgo_, vAme, E1, 19 + xar_m1 vAke_, vAgi, E2, 58 + xar_m1 vAgi_, vAka, E0, 61 + xar_m1 vAga_, vAbo, E3, 36 + xar_m1 vAbo_, vAmo, E3, 43 + xar_m1 vAmo_, vAmi, E2, 49 + xar_m1 vAmi_, vAke, E1, 54 + xar_m1 vAge_, vAgu, E4, 44 + xar_m1 vAgu_, vAsi, E2, 3 + xar_m1 vAsi_, vAku, E4, 25 + xar_m1 vAku_, vAsa, E0, 46 + xar_m1 vAma_, vAbu, E4, 37 + xar_m1 vAbu_, vAsu, E4, 50 + xar_m1 vAsu_, vAse, E1, 62 + xar_m1 vAme_, vAga, E0, 28 + xar_m1 vAbe_, vAge, E1, 20 + ldr tmp, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:reads=STACK_OFFSET_CONST_VECTOR + ld1r {v28.2d}, [tmp], #8 + str tmp, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:writes=STACK_OFFSET_CONST_VECTOR + bcax_m1 vAga, vAga_, vAgi_, vAge_ + bcax_m1 vAge, vAge_, vAgo_, vAgi_ + bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ + bcax_m1 vAgo, vAgo_, vAga_, vAgu_ + bcax_m1 vAgu, vAgu_, vAge_, vAga_ + bcax_m1 vAka, vAka_, vAki_, vAke_ + bcax_m1 vAke, vAke_, vAko_, vAki_ + bcax_m1 vAki, vAki_, vAku_, vAko_ + bcax_m1 vAko, vAko_, vAka_, vAku_ + bcax_m1 vAku, vAku_, vAke_, vAka_ + bcax_m1 vAma, vAma_, vAmi_, vAme_ + bcax_m1 vAme, vAme_, vAmo_, vAmi_ + bcax_m1 
vAmi, vAmi_, vAmu_, vAmo_ + bcax_m1 vAmo, vAmo_, vAma_, vAmu_ + bcax_m1 vAmu, vAmu_, vAme_, vAma_ + bcax_m1 vAsa, vAsa_, vAsi_, vAse_ + bcax_m1 vAse, vAse_, vAso_, vAsi_ + bcax_m1 vAsi, vAsi_, vAsu_, vAso_ + bcax_m1 vAso, vAso_, vAsa_, vAsu_ + bcax_m1 vAsu, vAsu_, vAse_, vAsa_ + bcax_m1 vAba, vAba_, vAbi_, vAbe_ + bcax_m1 vAbe, vAbe_, vAbo_, vAbi_ + bcax_m1 vAbi, vAbi_, vAbu_, vAbo_ + bcax_m1 vAbo, vAbo_, vAba_, vAbu_ + bcax_m1 vAbu, vAbu_, vAbe_, vAba_ + eor vAba.16b, vAba.16b, v28.16b +.endm + +.macro scalar_round_noninitial + eor x0, x15, x11, ror #52 // *........................................................ + eor x0, x0, x13, ror #48 // .*....................................................... + eor x26, x8, x9, ror #57 // .*....................................................... + eor x27, x0, x14, ror #10 // ..*...................................................... + eor x29, x16, x28, ror #63 // ..*...................................................... + eor x26, x26, x6, ror #51 // ...*..................................................... + eor x30, x23, x22, ror #50 // ...*..................................................... + eor x0, x26, x10, ror #31 // ....*.................................................... + eor x29, x29, x19, ror #37 // ....*.................................................... + eor x27, x27, x12, ror #5 // .....*................................................... + eor x30, x30, x24, ror #34 // .....*................................................... + eor x0, x0, x7, ror #27 // ......*.................................................. + eor x26, x30, x21, ror #26 // ......*.................................................. + eor x26, x26, x25, ror #15 // .......*................................................. + ror x30, x27, #62 // .......*................................................. + eor x30, x30, x26, ror #57 // ........*................................................ 
+ ror x26, x26, #58 // ........*................................................ + eor x16, x30, x16 // .........*............................................... + eor x28, x30, x28, ror #63 // .........*............................................... + str x28, [sp, #STACK_LOC_0] // ..........*.............................................. // @slothy:writes=stack_0 + eor x29, x29, x17, ror #36 // ..........*.............................................. + eor x28, x1, x2, ror #61 // ...........*............................................. + eor x19, x30, x19, ror #37 // ...........*............................................. + eor x29, x29, x20, ror #2 // ............*............................................ + eor x28, x28, x4, ror #54 // ............*............................................ + eor x26, x26, x0, ror #55 // .............*........................................... + eor x28, x28, x3, ror #39 // .............*........................................... + eor x28, x28, x5, ror #25 // ..............*.......................................... + ror x0, x0, #56 // ..............*.......................................... + eor x0, x0, x29, ror #63 // ...............*......................................... + eor x27, x28, x27, ror #61 // ...............*......................................... + eor x13, x0, x13, ror #46 // ................*........................................ + eor x28, x29, x28, ror #63 // ................*........................................ + eor x29, x30, x20, ror #2 // .................*....................................... + eor x20, x26, x3, ror #39 // .................*....................................... + eor x11, x0, x11, ror #50 // ..................*...................................... + eor x25, x28, x25, ror #9 // ..................*...................................... + eor x3, x28, x21, ror #20 // ...................*..................................... 
+ eor x21, x26, x1 // ...................*..................................... + eor x9, x27, x9, ror #49 // ....................*.................................... + eor x24, x28, x24, ror #28 // ....................*.................................... + eor x1, x30, x17, ror #36 // .....................*................................... + eor x14, x0, x14, ror #8 // .....................*................................... + eor x22, x28, x22, ror #44 // ......................*.................................. + eor x8, x27, x8, ror #56 // ......................*.................................. + eor x17, x27, x7, ror #19 // .......................*................................. + eor x15, x0, x15, ror #62 // .......................*................................. + bic x7, x20, x22, ror #47 // ........................*................................ + eor x4, x26, x4, ror #54 // ........................*................................ + eor x0, x0, x12, ror #3 // .........................*............................... + eor x28, x28, x23, ror #58 // .........................*............................... + eor x23, x26, x2, ror #61 // ..........................*.............................. + eor x26, x26, x5, ror #25 // ..........................*.............................. + eor x2, x7, x16, ror #39 // ...........................*............................. + bic x7, x9, x20, ror #42 // ...........................*............................. + bic x30, x15, x9, ror #16 // ............................*............................ + eor x7, x7, x22, ror #25 // ............................*............................ + eor x12, x30, x20, ror #58 // .............................*........................... + bic x20, x22, x16, ror #56 // .............................*........................... + eor x30, x27, x6, ror #43 // ..............................*.......................... 
+ eor x22, x20, x15, ror #23 // ..............................*.......................... + bic x6, x19, x13, ror #42 // ...............................*......................... + eor x6, x6, x17, ror #41 // ................................*........................ + bic x5, x13, x17, ror #63 // ................................*........................ + eor x5, x21, x5, ror #21 // .................................*....................... + bic x17, x17, x21, ror #44 // .................................*....................... + eor x27, x27, x10, ror #23 // ..................................*...................... + bic x21, x21, x25, ror #50 // ..................................*...................... + bic x20, x27, x4, ror #25 // ...................................*..................... + bic x10, x16, x15, ror #31 // ...................................*..................... + eor x16, x21, x19, ror #43 // ....................................*.................... + eor x21, x17, x25, ror #30 // ....................................*.................... + bic x19, x25, x19, ror #57 // .....................................*................... + ldr x25, [sp, #STACK_OFFSET_COUNT] // .....................................*................... // @slothy:reads=STACK_OFFSET_COUNT + eor x17, x10, x9, ror #47 // ......................................*.................. + ldr x9, [sp, #STACK_OFFSET_CONST_SCALAR] // ......................................*.................. + eor x15, x20, x28, ror #27 // .......................................*................. + bic x20, x4, x28, ror #2 // .......................................*................. + eor x10, x20, x1, ror #50 // ........................................*................ + bic x20, x11, x27, ror #60 // ........................................*................ + eor x20, x20, x4, ror #21 // .........................................*............... 
+ bic x4, x28, x1, ror #48 // .........................................*............... + bic x1, x1, x11, ror #57 // ..........................................*.............. + ldr x28, [x9, w25, UXTW #3] // ..........................................*.............. + ldr x9, [sp, #STACK_LOC_0] // ...........................................*............. // @slothy:reads=stack_0 + add x25, x25, #1 // ...........................................*............. + str x25, [sp, #STACK_OFFSET_COUNT] // ............................................*............ // @slothy:writes=STACK_OFFSET_COUNT + cmp x25, #(KECCAK_F1600_ROUNDS-1) // ............................................*............ // @slothy:ignore_useless_output + eor x25, x1, x27, ror #53 // .............................................*........... + bic x27, x30, x26, ror #47 // .............................................*........... + eor x1, x5, x28 // ..............................................*.......... + eor x5, x4, x11, ror #41 // ..............................................*.......... + eor x11, x19, x13, ror #35 // ...............................................*......... + bic x13, x26, x24, ror #10 // ...............................................*......... + eor x28, x27, x24, ror #57 // ................................................*........ + bic x27, x24, x9, ror #47 // ................................................*........ + bic x19, x23, x3, ror #9 // .................................................*....... + bic x4, x29, x14, ror #41 // .................................................*....... + eor x24, x19, x29, ror #44 // ..................................................*...... + bic x29, x3, x29, ror #35 // ..................................................*...... + eor x13, x13, x9, ror #57 // ...................................................*..... + eor x19, x29, x14, ror #12 // ...................................................*..... 
+ bic x29, x9, x0, ror #19 // ....................................................*.... + bic x14, x14, x8, ror #5 // ....................................................*.... + eor x9, x14, x23, ror #43 // .....................................................*... + eor x14, x4, x8, ror #46 // .....................................................*... + bic x23, x8, x23, ror #38 // ......................................................*.. + eor x8, x27, x0, ror #2 // ......................................................*.. + eor x4, x23, x3, ror #47 // .......................................................*. + bic x3, x0, x30, ror #5 // .......................................................*. + eor x23, x3, x26, ror #52 // ........................................................* + eor x3, x29, x30, ror #24 // ........................................................* + + // eor X, sAba, sAga, ror #61 + // eor X, X, sAma, ror #54 + // eor X, X, sAka, ror #39 + // eor X, X, sAsa, ror #25 + + // eor X, sAke, sAme, ror #57 + // eor X, X, sAbe, ror #51 + // eor X, X, sAse, ror #31 + // eor X, X, sAge, ror #27 + + // eor X, sAsi, sAbi, ror #52 + // eor X, X, sAki, ror #48 + // eor X, X, sAmi, ror #10 + // eor X, X, sAgi, ror #5 + + // eor X, sAbo, sAko, ror #63 + // eor X, X, sAmo, ror #37 + // eor X, X, sAgo, ror #36 + // eor X, X, sAso, ror #2 + + // eor X, sAku, sAgu, ror #50 + // eor X, X, sAmu, ror #34 + // eor X, X, sAbu, ror #26 + // eor X, X, sAsu, ror #15 + + // eor X, X, X, ror #61 + // ror X, X, #62 + // eor X, X, X, ror #57 + // ror X, X, #58 + // eor X, X, X, ror #55 + // ror X, X, #56 + // eor X, X, X, ror #63 + // eor X, X, X, ror #63 + + // eor X, X, sAba + // eor X, X, sAbi, ror #50 + // eor X, X, sAki, ror #46 + // eor X, X, sAko, ror #63 + // eor X, X, sAmu, ror #28 + // eor X, X, sAso, ror #2 + // eor X, X, sAma, ror #54 + // eor X, X, sAbe, ror #43 + // eor X, X, sAgo, ror #36 + // eor X, X, sAme, ror #49 + // eor X, X, sAgi, ror #3 + // eor X, X, 
sAka, ror #39 + // eor X, X, sAbo + // eor X, X, sAmo, ror #37 + // eor X, X, sAmi, ror #8 + // eor X, X, sAke, ror #56 + // eor X, X, sAgu, ror #44 + // eor X, X, sAsi, ror #62 + // eor X, X, sAku, ror #58 + // eor X, X, sAsa, ror #25 + // eor X, X, sAbu, ror #20 + // eor X, X, sAsu, ror #9 + // eor X, X, sAse, ror #23 + // eor X, X, sAga, ror #61 + // eor X, X, sAge, ror #19 + + // ldr X, [sp, #STACK_OFFSET_CONST_SCALAR] + // ldr X, [sp, #STACK_OFFSET_COUNT] // @slothy:reads=STACK_OFFSET_COUNT + // ldr X, [X, W, UXTW #3] + // add X, X, #1 + // cmp X, #(KECCAK_F1600_ROUNDS-1) // @slothy:ignore_useless_output + // str X, [sp, #STACK_OFFSET_COUNT] // @slothy:writes=STACK_OFFSET_COUNT + + // chi_step_ror sAga, X, X, X, 47, 39 + // chi_step_ror sAge, X, X, X, 42, 25 + // chi_step_ror sAgi, X, X, X, 16, 58 + // chi_step_ror sAgo, X, X, X, 31, 47 + // chi_step_ror sAgu, X, X, X, 56, 23 + // chi_step_ror sAka, X, X, X, 19, 24 + // chi_step_ror sAke, X, X, X, 47, 2 + // chi_step_ror sAki, X, X, X, 10, 57 + // chi_step_ror sAko, X, X, X, 47, 57 + // chi_step_ror sAku, X, X, X, 5, 52 + // chi_step_ror sAma, X, X, X, 38, 47 + // chi_step_ror sAme, X, X, X, 5, 43 + // chi_step_ror sAmi, X, X, X, 41, 46 + // chi_step_ror sAmo, X, X, X, 35, 12 + // chi_step_ror sAmu, X, X, X, 9, 44 + // chi_step_ror sAsa, X, X, X, 48, 41 + // chi_step_ror sAse, X, X, X, 2, 50 + // chi_step_ror sAsi, X, X, X, 25, 27 + // chi_step_ror sAso, X, X, X, 60, 21 + // chi_step_ror sAsu, X, X, X, 57, 53 + // chi_step_ror2 sAba, X, X, X, 63, 21 + // chi_step_ror sAbe, X, X, X, 42, 41 + // chi_step_ror sAbi, X, X, X, 57, 35 + // chi_step_ror sAbo, X, X, X, 50, 43 + // chi_step_ror sAbu, X, X, X, 44, 30 + + // eor sAba, sAba, X +.endm + +// final_scalar_rotate: argument-less macro that rewrites 23 of the sA* lane registers in place, +// each via `ror reg, reg, #(64-k)` with its own per-lane constant k (see the list below). +// sAba and sAbo do not appear in the list and are therefore left unchanged by this macro. +// NOTE(review): assuming the sA* names alias 64-bit X registers, `ror #(64-k)` is a rotate-left by k; +// presumably this undoes rotations that the round macros leave deferred -- confirm against the sA* definitions. +.macro final_scalar_rotate + ror sAga, sAga,#(64-3) + ror sAka, sAka,#(64-25) + ror sAma, sAma,#(64-10) + ror sAsa, sAsa,#(64-39) + ror sAbe, sAbe,#(64-21) + ror sAge, sAge,#(64-45) + ror sAke, sAke,#(64-8) + ror sAme, sAme,#(64-15) + ror sAse, sAse,#(64-41) + ror sAbi, 
sAbi,#(64-14) + ror sAgi, sAgi,#(64-61) + ror sAki, sAki,#(64-18) + ror sAmi, sAmi,#(64-56) + ror sAsi, sAsi,#(64-2) + ror sAgo, sAgo,#(64-28) + ror sAko, sAko,#(64-1) + ror sAmo, sAmo,#(64-27) + ror sAso, sAso,#(64-62) + ror sAbu, sAbu,#(64-44) + ror sAgu, sAgu,#(64-20) + ror sAku, sAku,#(64-6) + ror sAmu, sAmu,#(64-36) + ror sAsu, sAsu,#(64-55) +.endm + +.global keccak_f1600_x4_hybrid_slothy_interleaved +.global _keccak_f1600_x4_hybrid_slothy_interleaved +.text +.align 4 + +keccak_f1600_x4_hybrid_slothy_interleaved: +_keccak_f1600_x4_hybrid_slothy_interleaved: + alloc_stack + save_gprs + save_vregs + + ASM_LOAD(const_addr, round_constants) + + mov outer, #0 + str outer, [sp, #STACK_OFFSET_OUTER] // @slothy:writes=STACK_OFFSET_OUTER + str const_addr, [sp, #STACK_OFFSET_CONST_SCALAR] // @slothy:writes=STACK_OFFSET_CONST_SCALAR + str const_addr, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:writes=STACK_OFFSET_CONST_VECTOR + str input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:writes=STACK_OFFSET_INPUT + + load_input_vector 2,1 // Vector input + load_input_scalar 4,0 // First scalar input + + initial: + scalar_round_initial // @slothy:interleaving_class=0 + scalar_round_noninitial // @slothy:interleaving_class=0 + vector_round // @slothy:interleaving_class=1 + loop: + // Instructions: 386 + // Expected cycles: 193 + // Expected IPC: 2.00 + // + // --------------------------------------------------------------------------------------- cycle (expected) ---------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + eor x0, x15, x11, ror #52 // 
*................................................................................................................................................................................................ // @slothy:interleaving_class=0 + eor x0, x0, x13, ror #48 // *................................................................................................................................................................................................ // @slothy:interleaving_class=0 + eor v30.16B, v0.16B, v5.16B // .*............................................................................................................................................................................................... // @slothy:interleaving_class=1 + eor v30.16B, v30.16B, v10.16B // .*............................................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x26, x8, x9, ror #57 // ..*.............................................................................................................................................................................................. // @slothy:interleaving_class=0 + eor v30.16B, v30.16B, v15.16B // ..*.............................................................................................................................................................................................. // @slothy:interleaving_class=1 + eor x27, x0, x14, ror #10 // ...*............................................................................................................................................................................................. 
// @slothy:interleaving_class=0 + eor x29, x16, x28, ror #63 // ...*............................................................................................................................................................................................. // @slothy:interleaving_class=0 + eor v30.16B, v30.16B, v20.16B // ....*............................................................................................................................................................................................ // @slothy:interleaving_class=1 + eor x26, x26, x6, ror #51 // ....*............................................................................................................................................................................................ // @slothy:interleaving_class=0 + eor v29.16B, v1.16B, v6.16B // .....*........................................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x30, x23, x22, ror #50 // .....*........................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v29.16B, v29.16B, v11.16B // ......*.......................................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x0, x26, x10, ror #31 // ......*.......................................................................................................................................................................................... 
// @slothy:interleaving_class=0 + eor x29, x29, x19, ror #37 // .......*......................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v29.16B, v29.16B, v16.16B // .......*......................................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x27, x27, x12, ror #5 // ........*........................................................................................................................................................................................ // @slothy:interleaving_class=0 + eor v29.16B, v29.16B, v21.16B // ........*........................................................................................................................................................................................ // @slothy:interleaving_class=1 + eor x30, x30, x24, ror #34 // .........*....................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor x0, x0, x7, ror #27 // .........*....................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v28.16B, v2.16B, v7.16B // ..........*...................................................................................................................................................................................... 
// @slothy:interleaving_class=1 + eor x26, x30, x21, ror #26 // ..........*...................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v28.16B, v28.16B, v12.16B // ...........*..................................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x26, x26, x25, ror #15 // ...........*..................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v28.16B, v28.16B, v17.16B // ............*.................................................................................................................................................................................... // @slothy:interleaving_class=1 + ror x30, x27, #62 // ............*.................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor x30, x30, x26, ror #57 // .............*................................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v28.16B, v28.16B, v22.16B // .............*................................................................................................................................................................................... 
// @slothy:interleaving_class=1 + ror x26, x26, #58 // ..............*.................................................................................................................................................................................. // @slothy:interleaving_class=0 + eor v27.16B, v3.16B, v8.16B // ..............*.................................................................................................................................................................................. // @slothy:interleaving_class=1 + eor x16, x30, x16 // ...............*................................................................................................................................................................................. // @slothy:interleaving_class=0 + eor v27.16B, v27.16B, v13.16B // ...............*................................................................................................................................................................................. // @slothy:interleaving_class=1 + eor x28, x30, x28, ror #63 // ................*................................................................................................................................................................................ // @slothy:interleaving_class=0 + str x28, [sp, #STACK_LOC_0] // ................*................................................................................................................................................................................ // @slothy:writes=stack_0 // @slothy:interleaving_class=0 + eor v27.16B, v27.16B, v18.16B // .................*............................................................................................................................................................................... 
// @slothy:interleaving_class=1 + eor x29, x29, x17, ror #36 // .................*............................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v27.16B, v27.16B, v23.16B // ..................*.............................................................................................................................................................................. // @slothy:interleaving_class=1 + eor x28, x1, x2, ror #61 // ..................*.............................................................................................................................................................................. // @slothy:interleaving_class=0 + eor x19, x30, x19, ror #37 // ...................*............................................................................................................................................................................. // @slothy:interleaving_class=0 + eor v26.16B, v4.16B, v9.16B // ...................*............................................................................................................................................................................. // @slothy:interleaving_class=1 + eor x29, x29, x20, ror #2 // ....................*............................................................................................................................................................................ // @slothy:interleaving_class=0 + eor v26.16B, v26.16B, v14.16B // ....................*............................................................................................................................................................................ 
// @slothy:interleaving_class=1 + eor x28, x28, x4, ror #54 // .....................*........................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v26.16B, v26.16B, v19.16B // .....................*........................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x26, x26, x0, ror #55 // ......................*.......................................................................................................................................................................... // @slothy:interleaving_class=0 + eor x28, x28, x3, ror #39 // ......................*.......................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v26.16B, v26.16B, v24.16B // .......................*......................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x28, x28, x5, ror #25 // .......................*......................................................................................................................................................................... // @slothy:interleaving_class=0 + add v31.2D, v28.2D, v28.2D // ........................*........................................................................................................................................................................ 
// @slothy:interleaving_class=1 + ror x0, x0, #56 // ........................*........................................................................................................................................................................ // @slothy:interleaving_class=0 + eor x0, x0, x29, ror #63 // .........................*....................................................................................................................................................................... // @slothy:interleaving_class=0 + sri v31.2D, v28.2D, #63 // .........................*....................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x27, x28, x27, ror #61 // ..........................*...................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v25.16B, v31.16B, v30.16B // ..........................*...................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x13, x0, x13, ror #46 // ...........................*..................................................................................................................................................................... // @slothy:interleaving_class=0 + add v31.2D, v26.2D, v26.2D // ...........................*..................................................................................................................................................................... 
// @slothy:interleaving_class=1 + eor x28, x29, x28, ror #63 // ............................*.................................................................................................................................................................... // @slothy:interleaving_class=0 + eor x29, x30, x20, ror #2 // ............................*.................................................................................................................................................................... // @slothy:interleaving_class=0 + sri v31.2D, v26.2D, #63 // .............................*................................................................................................................................................................... // @slothy:interleaving_class=1 + eor x20, x26, x3, ror #39 // .............................*................................................................................................................................................................... // @slothy:interleaving_class=0 + eor v28.16B, v31.16B, v28.16B // ..............................*.................................................................................................................................................................. // @slothy:interleaving_class=1 + eor x11, x0, x11, ror #50 // ..............................*.................................................................................................................................................................. // @slothy:interleaving_class=0 + add v31.2D, v29.2D, v29.2D // ...............................*................................................................................................................................................................. 
// @slothy:interleaving_class=1 + eor x25, x28, x25, ror #9 // ...............................*................................................................................................................................................................. // @slothy:interleaving_class=0 + eor x3, x28, x21, ror #20 // ................................*................................................................................................................................................................ // @slothy:interleaving_class=0 + sri v31.2D, v29.2D, #63 // ................................*................................................................................................................................................................ // @slothy:interleaving_class=1 + eor x21, x26, x1 // .................................*............................................................................................................................................................... // @slothy:interleaving_class=0 + eor v26.16B, v31.16B, v26.16B // .................................*............................................................................................................................................................... // @slothy:interleaving_class=1 + eor x9, x27, x9, ror #49 // ..................................*.............................................................................................................................................................. // @slothy:interleaving_class=0 + eor x24, x28, x24, ror #28 // ..................................*.............................................................................................................................................................. 
// @slothy:interleaving_class=0 + add v31.2D, v27.2D, v27.2D // ...................................*............................................................................................................................................................. // @slothy:interleaving_class=1 + eor x1, x30, x17, ror #36 // ...................................*............................................................................................................................................................. // @slothy:interleaving_class=0 + sri v31.2D, v27.2D, #63 // ....................................*............................................................................................................................................................ // @slothy:interleaving_class=1 + eor x14, x0, x14, ror #8 // ....................................*............................................................................................................................................................ // @slothy:interleaving_class=0 + eor v29.16B, v31.16B, v29.16B // .....................................*........................................................................................................................................................... // @slothy:interleaving_class=1 + eor x22, x28, x22, ror #44 // .....................................*........................................................................................................................................................... // @slothy:interleaving_class=0 + eor x8, x27, x8, ror #56 // ......................................*.......................................................................................................................................................... 
// @slothy:interleaving_class=0 + add v31.2D, v30.2D, v30.2D // ......................................*.......................................................................................................................................................... // @slothy:interleaving_class=1 + eor x17, x27, x7, ror #19 // .......................................*......................................................................................................................................................... // @slothy:interleaving_class=0 + sri v31.2D, v30.2D, #63 // .......................................*......................................................................................................................................................... // @slothy:interleaving_class=1 + eor x15, x0, x15, ror #62 // ........................................*........................................................................................................................................................ // @slothy:interleaving_class=0 + bic x7, x20, x22, ror #47 // ........................................*........................................................................................................................................................ // @slothy:interleaving_class=0 + eor v27.16B, v31.16B, v27.16B // .........................................*....................................................................................................................................................... // @slothy:interleaving_class=1 + eor x4, x26, x4, ror #54 // .........................................*....................................................................................................................................................... 
// @slothy:interleaving_class=0 + eor v30.16B, v0.16B, v26.16B // ..........................................*...................................................................................................................................................... // @slothy:interleaving_class=1 + eor x0, x0, x12, ror #3 // ..........................................*...................................................................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v2.16B, v29.16B // ...........................................*..................................................................................................................................................... // @slothy:interleaving_class=1 + eor x28, x28, x23, ror #58 // ...........................................*..................................................................................................................................................... // @slothy:interleaving_class=0 + eor x23, x26, x2, ror #61 // ............................................*.................................................................................................................................................... // @slothy:interleaving_class=0 + shl v0.2D, v31.2D, #(64-2) // ............................................*.................................................................................................................................................... // @slothy:interleaving_class=1 + eor x26, x26, x5, ror #25 // .............................................*................................................................................................................................................... 
// @slothy:interleaving_class=0 + sri v0.2D, v31.2D, #(2) // .............................................*................................................................................................................................................... // @slothy:interleaving_class=1 + eor x2, x7, x16, ror #39 // ..............................................*.................................................................................................................................................. // @slothy:interleaving_class=0 + eor v31.16B, v12.16B, v29.16B // ..............................................*.................................................................................................................................................. // @slothy:interleaving_class=1 + bic x7, x9, x20, ror #42 // ...............................................*................................................................................................................................................. // @slothy:interleaving_class=0 + bic x30, x15, x9, ror #16 // ...............................................*................................................................................................................................................. // @slothy:interleaving_class=0 + shl v2.2D, v31.2D, #(64-21) // ................................................*................................................................................................................................................ // @slothy:interleaving_class=1 + eor x7, x7, x22, ror #25 // ................................................*................................................................................................................................................ 
// @slothy:interleaving_class=0 + sri v2.2D, v31.2D, #(21) // .................................................*............................................................................................................................................... // @slothy:interleaving_class=1 + eor x12, x30, x20, ror #58 // .................................................*............................................................................................................................................... // @slothy:interleaving_class=0 + bic x20, x22, x16, ror #56 // ..................................................*.............................................................................................................................................. // @slothy:interleaving_class=0 + eor v31.16B, v13.16B, v28.16B // ..................................................*.............................................................................................................................................. // @slothy:interleaving_class=1 + eor x30, x27, x6, ror #43 // ...................................................*............................................................................................................................................. // @slothy:interleaving_class=0 + shl v12.2D, v31.2D, #(64-39) // ...................................................*............................................................................................................................................. // @slothy:interleaving_class=1 + eor x22, x20, x15, ror #23 // ....................................................*............................................................................................................................................ 
// @slothy:interleaving_class=0 + sri v12.2D, v31.2D, #(39) // ....................................................*............................................................................................................................................ // @slothy:interleaving_class=1 + bic x6, x19, x13, ror #42 // .....................................................*........................................................................................................................................... // @slothy:interleaving_class=0 + eor x6, x6, x17, ror #41 // .....................................................*........................................................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v19.16B, v27.16B // ......................................................*.......................................................................................................................................... // @slothy:interleaving_class=1 + bic x5, x13, x17, ror #63 // ......................................................*.......................................................................................................................................... // @slothy:interleaving_class=0 + shl v13.2D, v31.2D, #(64-56) // .......................................................*......................................................................................................................................... // @slothy:interleaving_class=1 + eor x5, x21, x5, ror #21 // .......................................................*......................................................................................................................................... 
// @slothy:interleaving_class=0 + sri v13.2D, v31.2D, #(56) // ........................................................*........................................................................................................................................ // @slothy:interleaving_class=1 + bic x17, x17, x21, ror #44 // ........................................................*........................................................................................................................................ // @slothy:interleaving_class=0 + eor x27, x27, x10, ror #23 // .........................................................*....................................................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v23.16B, v28.16B // .........................................................*....................................................................................................................................... // @slothy:interleaving_class=1 + bic x21, x21, x25, ror #50 // ..........................................................*...................................................................................................................................... // @slothy:interleaving_class=0 + shl v19.2D, v31.2D, #(64-8) // ..........................................................*...................................................................................................................................... // @slothy:interleaving_class=1 + bic x20, x27, x4, ror #25 // ...........................................................*..................................................................................................................................... 
// @slothy:interleaving_class=0 + bic x10, x16, x15, ror #31 // ...........................................................*..................................................................................................................................... // @slothy:interleaving_class=0 + sri v19.2D, v31.2D, #(8) // ............................................................*.................................................................................................................................... // @slothy:interleaving_class=1 + eor x16, x21, x19, ror #43 // ............................................................*.................................................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v15.16B, v26.16B // .............................................................*................................................................................................................................... // @slothy:interleaving_class=1 + eor x21, x17, x25, ror #30 // .............................................................*................................................................................................................................... // @slothy:interleaving_class=0 + shl v23.2D, v31.2D, #(64-23) // ..............................................................*.................................................................................................................................. // @slothy:interleaving_class=1 + bic x19, x25, x19, ror #57 // ..............................................................*.................................................................................................................................. 
// @slothy:interleaving_class=0 + ldr x25, [sp, #STACK_OFFSET_COUNT] // ...............................................................*................................................................................................................................. // @slothy:reads=STACK_OFFSET_COUNT // @slothy:interleaving_class=0 + sri v23.2D, v31.2D, #(23) // ...............................................................*................................................................................................................................. // @slothy:interleaving_class=1 + eor x17, x10, x9, ror #47 // ................................................................*................................................................................................................................ // @slothy:interleaving_class=0 + eor v31.16B, v1.16B, v25.16B // ................................................................*................................................................................................................................ // @slothy:interleaving_class=1 + ldr x9, [sp, #STACK_OFFSET_CONST_SCALAR] // .................................................................*............................................................................................................................... // @slothy:interleaving_class=0 + eor x15, x20, x28, ror #27 // .................................................................*............................................................................................................................... // @slothy:interleaving_class=0 + shl v15.2D, v31.2D, #(64-63) // ..................................................................*.............................................................................................................................. 
// @slothy:interleaving_class=1 + bic x20, x4, x28, ror #2 // ..................................................................*.............................................................................................................................. // @slothy:interleaving_class=0 + sri v15.2D, v31.2D, #(63) // ...................................................................*............................................................................................................................. // @slothy:interleaving_class=1 + eor x10, x20, x1, ror #50 // ...................................................................*............................................................................................................................. // @slothy:interleaving_class=0 + eor v31.16B, v8.16B, v28.16B // ....................................................................*............................................................................................................................ // @slothy:interleaving_class=1 + bic x20, x11, x27, ror #60 // ....................................................................*............................................................................................................................ // @slothy:interleaving_class=0 + eor x20, x20, x4, ror #21 // .....................................................................*........................................................................................................................... // @slothy:interleaving_class=0 + shl v1.2D, v31.2D, #(64-9) // .....................................................................*........................................................................................................................... 
// @slothy:interleaving_class=1 + bic x4, x28, x1, ror #48 // ......................................................................*.......................................................................................................................... // @slothy:interleaving_class=0 + sri v1.2D, v31.2D, #(9) // ......................................................................*.......................................................................................................................... // @slothy:interleaving_class=1 + bic x1, x1, x11, ror #57 // .......................................................................*......................................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v16.16B, v25.16B // .......................................................................*......................................................................................................................... // @slothy:interleaving_class=1 + ldr x28, [x9, w25, UXTW #3] // ........................................................................*........................................................................................................................ // @slothy:interleaving_class=0 + ldr x9, [sp, #STACK_LOC_0] // ........................................................................*........................................................................................................................ // @slothy:reads=stack_0 // @slothy:interleaving_class=0 + shl v8.2D, v31.2D, #(64-19) // .........................................................................*....................................................................................................................... 
// @slothy:interleaving_class=1 + add x25, x25, #1 // .........................................................................*....................................................................................................................... // @slothy:interleaving_class=0 + sri v8.2D, v31.2D, #(19) // ..........................................................................*...................................................................................................................... // @slothy:interleaving_class=1 + str x25, [sp, #STACK_OFFSET_COUNT] // ..........................................................................*...................................................................................................................... // @slothy:writes=STACK_OFFSET_COUNT // @slothy:interleaving_class=0 + cmp x25, #(KECCAK_F1600_ROUNDS-1) // ...........................................................................*..................................................................................................................... // @slothy:ignore_useless_output // @slothy:interleaving_class=0 + eor v31.16B, v7.16B, v29.16B // ...........................................................................*..................................................................................................................... // @slothy:interleaving_class=1 + eor x25, x1, x27, ror #53 // ............................................................................*.................................................................................................................... // @slothy:interleaving_class=0 + shl v16.2D, v31.2D, #(64-58) // ............................................................................*.................................................................................................................... 
// @slothy:interleaving_class=1 + bic x27, x30, x26, ror #47 // .............................................................................*................................................................................................................... // @slothy:interleaving_class=0 + sri v16.2D, v31.2D, #(58) // .............................................................................*................................................................................................................... // @slothy:interleaving_class=1 + eor x1, x5, x28 // ..............................................................................*.................................................................................................................. // @slothy:interleaving_class=0 + eor x5, x4, x11, ror #41 // ..............................................................................*.................................................................................................................. // @slothy:interleaving_class=0 + eor v31.16B, v10.16B, v26.16B // ...............................................................................*................................................................................................................. // @slothy:interleaving_class=1 + eor x11, x19, x13, ror #35 // ...............................................................................*................................................................................................................. // @slothy:interleaving_class=0 + shl v7.2D, v31.2D, #(64-61) // ................................................................................*................................................................................................................ 
// @slothy:interleaving_class=1 + bic x13, x26, x24, ror #10 // ................................................................................*................................................................................................................ // @slothy:interleaving_class=0 + eor x28, x27, x24, ror #57 // .................................................................................*............................................................................................................... // @slothy:interleaving_class=0 + sri v7.2D, v31.2D, #(61) // .................................................................................*............................................................................................................... // @slothy:interleaving_class=1 + bic x27, x24, x9, ror #47 // ..................................................................................*.............................................................................................................. // @slothy:interleaving_class=0 + eor v31.16B, v3.16B, v28.16B // ..................................................................................*.............................................................................................................. // @slothy:interleaving_class=1 + bic x19, x23, x3, ror #9 // ...................................................................................*............................................................................................................. // @slothy:interleaving_class=0 + shl v10.2D, v31.2D, #(64-36) // ...................................................................................*............................................................................................................. 
// @slothy:interleaving_class=1 + bic x4, x29, x14, ror #41 // ....................................................................................*............................................................................................................ // @slothy:interleaving_class=0 + eor x24, x19, x29, ror #44 // ....................................................................................*............................................................................................................ // @slothy:interleaving_class=0 + sri v10.2D, v31.2D, #(36) // .....................................................................................*........................................................................................................... // @slothy:interleaving_class=1 + bic x29, x3, x29, ror #35 // .....................................................................................*........................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v18.16B, v28.16B // ......................................................................................*.......................................................................................................... // @slothy:interleaving_class=1 + eor x13, x13, x9, ror #57 // ......................................................................................*.......................................................................................................... // @slothy:interleaving_class=0 + shl v3.2D, v31.2D, #(64-43) // .......................................................................................*......................................................................................................... 
// @slothy:interleaving_class=1 + eor x19, x29, x14, ror #12 // .......................................................................................*......................................................................................................... // @slothy:interleaving_class=0 + bic x29, x9, x0, ror #19 // ........................................................................................*........................................................................................................ // @slothy:interleaving_class=0 + sri v3.2D, v31.2D, #(43) // ........................................................................................*........................................................................................................ // @slothy:interleaving_class=1 + bic x14, x14, x8, ror #5 // .........................................................................................*....................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v17.16B, v29.16B // .........................................................................................*....................................................................................................... // @slothy:interleaving_class=1 + eor x9, x14, x23, ror #43 // ..........................................................................................*...................................................................................................... // @slothy:interleaving_class=0 + eor x14, x4, x8, ror #46 // ..........................................................................................*...................................................................................................... 
// @slothy:interleaving_class=0 + shl v18.2D, v31.2D, #(64-49) // ...........................................................................................*..................................................................................................... // @slothy:interleaving_class=1 + bic x23, x8, x23, ror #38 // ...........................................................................................*..................................................................................................... // @slothy:interleaving_class=0 + sri v18.2D, v31.2D, #(49) // ............................................................................................*.................................................................................................... // @slothy:interleaving_class=1 + eor x8, x27, x0, ror #2 // ............................................................................................*.................................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v11.16B, v25.16B // .............................................................................................*................................................................................................... // @slothy:interleaving_class=1 + eor x4, x23, x3, ror #47 // .............................................................................................*................................................................................................... // @slothy:interleaving_class=0 + bic x3, x0, x30, ror #5 // ..............................................................................................*.................................................................................................. 
// @slothy:interleaving_class=0 + shl v17.2D, v31.2D, #(64-54) // ..............................................................................................*.................................................................................................. // @slothy:interleaving_class=1 + eor x23, x3, x26, ror #52 // ...............................................................................................*................................................................................................. // @slothy:interleaving_class=0 + sri v17.2D, v31.2D, #(54) // ...............................................................................................*................................................................................................. // @slothy:interleaving_class=1 + eor x3, x29, x30, ror #24 // ................................................................................................*................................................................................................ // @slothy:interleaving_class=0 + eor x0, x15, x11, ror #52 // ................................................................................................*................................................................................................ // @slothy:interleaving_class=0 + eor v31.16B, v9.16B, v27.16B // .................................................................................................*............................................................................................... // @slothy:interleaving_class=1 + eor x0, x0, x13, ror #48 // .................................................................................................*............................................................................................... 
// @slothy:interleaving_class=0 + shl v11.2D, v31.2D, #(64-44) // ..................................................................................................*.............................................................................................. // @slothy:interleaving_class=1 + eor x26, x8, x9, ror #57 // ..................................................................................................*.............................................................................................. // @slothy:interleaving_class=0 + sri v11.2D, v31.2D, #(44) // ...................................................................................................*............................................................................................. // @slothy:interleaving_class=1 + eor x27, x0, x14, ror #10 // ...................................................................................................*............................................................................................. // @slothy:interleaving_class=0 + eor x29, x16, x28, ror #63 // ....................................................................................................*............................................................................................ // @slothy:interleaving_class=0 + eor v31.16B, v22.16B, v29.16B // ....................................................................................................*............................................................................................ // @slothy:interleaving_class=1 + eor x26, x26, x6, ror #51 // .....................................................................................................*........................................................................................... 
// @slothy:interleaving_class=0 + shl v9.2D, v31.2D, #(64-3) // .....................................................................................................*........................................................................................... // @slothy:interleaving_class=1 + eor x30, x23, x22, ror #50 // ......................................................................................................*.......................................................................................... // @slothy:interleaving_class=0 + sri v9.2D, v31.2D, #(3) // ......................................................................................................*.......................................................................................... // @slothy:interleaving_class=1 + eor x0, x26, x10, ror #31 // .......................................................................................................*......................................................................................... // @slothy:interleaving_class=0 + eor x29, x29, x19, ror #37 // .......................................................................................................*......................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v14.16B, v27.16B // ........................................................................................................*........................................................................................ // @slothy:interleaving_class=1 + eor x27, x27, x12, ror #5 // ........................................................................................................*........................................................................................ 
// @slothy:interleaving_class=0 + shl v22.2D, v31.2D, #(64-25) // .........................................................................................................*....................................................................................... // @slothy:interleaving_class=1 + eor x30, x30, x24, ror #34 // .........................................................................................................*....................................................................................... // @slothy:interleaving_class=0 + eor x0, x0, x7, ror #27 // ..........................................................................................................*...................................................................................... // @slothy:interleaving_class=0 + sri v22.2D, v31.2D, #(25) // ..........................................................................................................*...................................................................................... // @slothy:interleaving_class=1 + eor x26, x30, x21, ror #26 // ...........................................................................................................*..................................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v20.16B, v26.16B // ...........................................................................................................*..................................................................................... // @slothy:interleaving_class=1 + eor x26, x26, x25, ror #15 // ............................................................................................................*.................................................................................... 
// @slothy:interleaving_class=0 + shl v14.2D, v31.2D, #(64-46) // ............................................................................................................*.................................................................................... // @slothy:interleaving_class=1 + ror x30, x27, #62 // .............................................................................................................*................................................................................... // @slothy:interleaving_class=0 + eor x30, x30, x26, ror #57 // .............................................................................................................*................................................................................... // @slothy:interleaving_class=0 + sri v14.2D, v31.2D, #(46) // ..............................................................................................................*.................................................................................. // @slothy:interleaving_class=1 + ror x26, x26, #58 // ..............................................................................................................*.................................................................................. // @slothy:interleaving_class=0 + eor v31.16B, v4.16B, v27.16B // ...............................................................................................................*................................................................................. // @slothy:interleaving_class=1 + eor x16, x30, x16 // ...............................................................................................................*................................................................................. 
// @slothy:interleaving_class=0 + shl v20.2D, v31.2D, #(64-37) // ................................................................................................................*................................................................................ // @slothy:interleaving_class=1 + eor x28, x30, x28, ror #63 // ................................................................................................................*................................................................................ // @slothy:interleaving_class=0 + str x28, [sp, #STACK_LOC_0] // .................................................................................................................*............................................................................... // @slothy:writes=stack_0 // @slothy:interleaving_class=0 + sri v20.2D, v31.2D, #(37) // .................................................................................................................*............................................................................... // @slothy:interleaving_class=1 + eor x29, x29, x17, ror #36 // ..................................................................................................................*.............................................................................. // @slothy:interleaving_class=0 + eor v31.16B, v24.16B, v27.16B // ..................................................................................................................*.............................................................................. // @slothy:interleaving_class=1 + eor x28, x1, x2, ror #61 // ...................................................................................................................*............................................................................. 
// @slothy:interleaving_class=0 + eor x19, x30, x19, ror #37 // ...................................................................................................................*............................................................................. // @slothy:interleaving_class=0 + shl v4.2D, v31.2D, #(64-50) // ....................................................................................................................*............................................................................ // @slothy:interleaving_class=1 + eor x29, x29, x20, ror #2 // ....................................................................................................................*............................................................................ // @slothy:interleaving_class=0 + sri v4.2D, v31.2D, #(50) // .....................................................................................................................*........................................................................... // @slothy:interleaving_class=1 + eor x28, x28, x4, ror #54 // .....................................................................................................................*........................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v21.16B, v25.16B // ......................................................................................................................*.......................................................................... // @slothy:interleaving_class=1 + eor x26, x26, x0, ror #55 // ......................................................................................................................*.......................................................................... 
// @slothy:interleaving_class=0 + eor x28, x28, x3, ror #39 // .......................................................................................................................*......................................................................... // @slothy:interleaving_class=0 + shl v24.2D, v31.2D, #(64-62) // .......................................................................................................................*......................................................................... // @slothy:interleaving_class=1 + eor x28, x28, x5, ror #25 // ........................................................................................................................*........................................................................ // @slothy:interleaving_class=0 + sri v24.2D, v31.2D, #(62) // ........................................................................................................................*........................................................................ // @slothy:interleaving_class=1 + ror x0, x0, #56 // .........................................................................................................................*....................................................................... // @slothy:interleaving_class=0 + eor x0, x0, x29, ror #63 // .........................................................................................................................*....................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v5.16B, v26.16B // ..........................................................................................................................*...................................................................... 
// @slothy:interleaving_class=1 + eor x27, x28, x27, ror #61 // ..........................................................................................................................*...................................................................... // @slothy:interleaving_class=0 + shl v21.2D, v31.2D, #(64-28) // ...........................................................................................................................*..................................................................... // @slothy:interleaving_class=1 + eor x13, x0, x13, ror #46 // ...........................................................................................................................*..................................................................... // @slothy:interleaving_class=0 + sri v21.2D, v31.2D, #(28) // ............................................................................................................................*.................................................................... // @slothy:interleaving_class=1 + eor x28, x29, x28, ror #63 // ............................................................................................................................*.................................................................... // @slothy:interleaving_class=0 + eor x29, x30, x20, ror #2 // .............................................................................................................................*................................................................... // @slothy:interleaving_class=0 + eor v31.16B, v6.16B, v25.16B // .............................................................................................................................*................................................................... 
// @slothy:interleaving_class=1 + eor x20, x26, x3, ror #39 // ..............................................................................................................................*.................................................................. // @slothy:interleaving_class=0 + shl v27.2D, v31.2D, #(64-20) // ..............................................................................................................................*.................................................................. // @slothy:interleaving_class=1 + eor x11, x0, x11, ror #50 // ...............................................................................................................................*................................................................. // @slothy:interleaving_class=0 + sri v27.2D, v31.2D, #(20) // ...............................................................................................................................*................................................................. // @slothy:interleaving_class=1 + eor x25, x28, x25, ror #9 // ................................................................................................................................*................................................................ // @slothy:interleaving_class=0 + eor x3, x28, x21, ror #20 // ................................................................................................................................*................................................................ // @slothy:interleaving_class=0 + bic v31.16B, v7.16B, v11.16B // .................................................................................................................................*............................................................... 
// @slothy:interleaving_class=1 + eor x21, x26, x1 // .................................................................................................................................*............................................................... // @slothy:interleaving_class=0 + eor v5.16B, v31.16B, v10.16B // ..................................................................................................................................*.............................................................. // @slothy:interleaving_class=1 + eor x9, x27, x9, ror #49 // ..................................................................................................................................*.............................................................. // @slothy:interleaving_class=0 + eor x24, x28, x24, ror #28 // ...................................................................................................................................*............................................................. // @slothy:interleaving_class=0 + bic v31.16B, v8.16B, v7.16B // ...................................................................................................................................*............................................................. // @slothy:interleaving_class=1 + eor x1, x30, x17, ror #36 // ....................................................................................................................................*............................................................ // @slothy:interleaving_class=0 + eor v6.16B, v31.16B, v11.16B // ....................................................................................................................................*............................................................ 
// @slothy:interleaving_class=1 + eor x14, x0, x14, ror #8 // .....................................................................................................................................*........................................................... // @slothy:interleaving_class=0 + bic v31.16B, v9.16B, v8.16B // .....................................................................................................................................*........................................................... // @slothy:interleaving_class=1 + eor x22, x28, x22, ror #44 // ......................................................................................................................................*.......................................................... // @slothy:interleaving_class=0 + eor x8, x27, x8, ror #56 // ......................................................................................................................................*.......................................................... // @slothy:interleaving_class=0 + eor v7.16B, v31.16B, v7.16B // .......................................................................................................................................*......................................................... // @slothy:interleaving_class=1 + eor x17, x27, x7, ror #19 // .......................................................................................................................................*......................................................... // @slothy:interleaving_class=0 + bic v31.16B, v10.16B, v9.16B // ........................................................................................................................................*........................................................ 
// @slothy:interleaving_class=1 + eor x15, x0, x15, ror #62 // ........................................................................................................................................*........................................................ // @slothy:interleaving_class=0 + bic x7, x20, x22, ror #47 // .........................................................................................................................................*....................................................... // @slothy:interleaving_class=0 + eor v8.16B, v31.16B, v8.16B // .........................................................................................................................................*....................................................... // @slothy:interleaving_class=1 + eor x4, x26, x4, ror #54 // ..........................................................................................................................................*...................................................... // @slothy:interleaving_class=0 + bic v31.16B, v11.16B, v10.16B // ..........................................................................................................................................*...................................................... // @slothy:interleaving_class=1 + eor x0, x0, x12, ror #3 // ...........................................................................................................................................*..................................................... // @slothy:interleaving_class=0 + eor v9.16B, v31.16B, v9.16B // ...........................................................................................................................................*..................................................... 
// @slothy:interleaving_class=1 + eor x28, x28, x23, ror #58 // ............................................................................................................................................*.................................................... // @slothy:interleaving_class=0 + eor x23, x26, x2, ror #61 // ............................................................................................................................................*.................................................... // @slothy:interleaving_class=0 + bic v31.16B, v12.16B, v16.16B // .............................................................................................................................................*................................................... // @slothy:interleaving_class=1 + eor x26, x26, x5, ror #25 // .............................................................................................................................................*................................................... // @slothy:interleaving_class=0 + eor v10.16B, v31.16B, v15.16B // ..............................................................................................................................................*.................................................. // @slothy:interleaving_class=1 + eor x2, x7, x16, ror #39 // ..............................................................................................................................................*.................................................. // @slothy:interleaving_class=0 + bic v31.16B, v13.16B, v12.16B // ...............................................................................................................................................*................................................. 
// @slothy:interleaving_class=1 + bic x7, x9, x20, ror #42 // ...............................................................................................................................................*................................................. // @slothy:interleaving_class=0 + bic x30, x15, x9, ror #16 // ................................................................................................................................................*................................................ // @slothy:interleaving_class=0 + eor v11.16B, v31.16B, v16.16B // ................................................................................................................................................*................................................ // @slothy:interleaving_class=1 + eor x7, x7, x22, ror #25 // .................................................................................................................................................*............................................... // @slothy:interleaving_class=0 + bic v31.16B, v14.16B, v13.16B // .................................................................................................................................................*............................................... // @slothy:interleaving_class=1 + eor x12, x30, x20, ror #58 // ..................................................................................................................................................*.............................................. // @slothy:interleaving_class=0 + bic x20, x22, x16, ror #56 // ..................................................................................................................................................*.............................................. 
// @slothy:interleaving_class=0 + eor v12.16B, v31.16B, v12.16B // ...................................................................................................................................................*............................................. // @slothy:interleaving_class=1 + eor x30, x27, x6, ror #43 // ...................................................................................................................................................*............................................. // @slothy:interleaving_class=0 + bic v31.16B, v15.16B, v14.16B // ....................................................................................................................................................*............................................ // @slothy:interleaving_class=1 + eor x22, x20, x15, ror #23 // ....................................................................................................................................................*............................................ // @slothy:interleaving_class=0 + eor v13.16B, v31.16B, v13.16B // .....................................................................................................................................................*........................................... // @slothy:interleaving_class=1 + bic x6, x19, x13, ror #42 // .....................................................................................................................................................*........................................... // @slothy:interleaving_class=0 + eor x6, x6, x17, ror #41 // ......................................................................................................................................................*.......................................... 
// @slothy:interleaving_class=0 + bic v31.16B, v16.16B, v15.16B // ......................................................................................................................................................*.......................................... // @slothy:interleaving_class=1 + bic x5, x13, x17, ror #63 // .......................................................................................................................................................*......................................... // @slothy:interleaving_class=0 + eor v14.16B, v31.16B, v14.16B // .......................................................................................................................................................*......................................... // @slothy:interleaving_class=1 + eor x5, x21, x5, ror #21 // ........................................................................................................................................................*........................................ // @slothy:interleaving_class=0 + bic v31.16B, v17.16B, v21.16B // ........................................................................................................................................................*........................................ // @slothy:interleaving_class=1 + bic x17, x17, x21, ror #44 // .........................................................................................................................................................*....................................... // @slothy:interleaving_class=0 + eor x27, x27, x10, ror #23 // .........................................................................................................................................................*....................................... 
// @slothy:interleaving_class=0 + eor v15.16B, v31.16B, v20.16B // ..........................................................................................................................................................*...................................... // @slothy:interleaving_class=1 + bic x21, x21, x25, ror #50 // ..........................................................................................................................................................*...................................... // @slothy:interleaving_class=0 + bic v31.16B, v18.16B, v17.16B // ...........................................................................................................................................................*..................................... // @slothy:interleaving_class=1 + bic x20, x27, x4, ror #25 // ...........................................................................................................................................................*..................................... // @slothy:interleaving_class=0 + bic x10, x16, x15, ror #31 // ............................................................................................................................................................*.................................... // @slothy:interleaving_class=0 + eor v16.16B, v31.16B, v21.16B // ............................................................................................................................................................*.................................... // @slothy:interleaving_class=1 + eor x16, x21, x19, ror #43 // .............................................................................................................................................................*................................... 
// @slothy:interleaving_class=0 + bic v31.16B, v19.16B, v18.16B // .............................................................................................................................................................*................................... // @slothy:interleaving_class=1 + eor x21, x17, x25, ror #30 // ..............................................................................................................................................................*.................................. // @slothy:interleaving_class=0 + eor v17.16B, v31.16B, v17.16B // ..............................................................................................................................................................*.................................. // @slothy:interleaving_class=1 + bic x19, x25, x19, ror #57 // ...............................................................................................................................................................*................................. // @slothy:interleaving_class=0 + ldr x25, [sp, #STACK_OFFSET_COUNT] // ...............................................................................................................................................................*................................. // @slothy:reads=STACK_OFFSET_COUNT // @slothy:interleaving_class=0 + bic v31.16B, v20.16B, v19.16B // ................................................................................................................................................................*................................ // @slothy:interleaving_class=1 + eor x17, x10, x9, ror #47 // ................................................................................................................................................................*................................ 
// @slothy:interleaving_class=0 + eor v18.16B, v31.16B, v18.16B // .................................................................................................................................................................*............................... // @slothy:interleaving_class=1 + ldr x9, [sp, #STACK_OFFSET_CONST_SCALAR] // .................................................................................................................................................................*............................... // @slothy:interleaving_class=0 + eor x15, x20, x28, ror #27 // ..................................................................................................................................................................*.............................. // @slothy:interleaving_class=0 + bic v31.16B, v21.16B, v20.16B // ..................................................................................................................................................................*.............................. // @slothy:interleaving_class=1 + bic x20, x4, x28, ror #2 // ...................................................................................................................................................................*............................. // @slothy:interleaving_class=0 + eor v19.16B, v31.16B, v19.16B // ...................................................................................................................................................................*............................. // @slothy:interleaving_class=1 + eor x10, x20, x1, ror #50 // ....................................................................................................................................................................*............................ 
// @slothy:interleaving_class=0 + bic v31.16B, v22.16B, v1.16B // ....................................................................................................................................................................*............................ // @slothy:interleaving_class=1 + bic x20, x11, x27, ror #60 // .....................................................................................................................................................................*........................... // @slothy:interleaving_class=0 + eor x20, x20, x4, ror #21 // .....................................................................................................................................................................*........................... // @slothy:interleaving_class=0 + eor v20.16B, v31.16B, v0.16B // ......................................................................................................................................................................*.......................... // @slothy:interleaving_class=1 + bic x4, x28, x1, ror #48 // ......................................................................................................................................................................*.......................... // @slothy:interleaving_class=0 + bic v31.16B, v23.16B, v22.16B // .......................................................................................................................................................................*......................... // @slothy:interleaving_class=1 + bic x1, x1, x11, ror #57 // .......................................................................................................................................................................*......................... 
// @slothy:interleaving_class=0 + eor v21.16B, v31.16B, v1.16B // ........................................................................................................................................................................*........................ // @slothy:interleaving_class=1 + ldr x28, [x9, w25, UXTW #3] // ........................................................................................................................................................................*........................ // @slothy:interleaving_class=0 + ldr x9, [sp, #STACK_LOC_0] // .........................................................................................................................................................................*....................... // @slothy:reads=stack_0 // @slothy:interleaving_class=0 + bic v31.16B, v24.16B, v23.16B // .........................................................................................................................................................................*....................... // @slothy:interleaving_class=1 + add x25, x25, #1 // ..........................................................................................................................................................................*...................... // @slothy:interleaving_class=0 + eor v22.16B, v31.16B, v22.16B // ..........................................................................................................................................................................*...................... // @slothy:interleaving_class=1 + str x25, [sp, #STACK_OFFSET_COUNT] // ...........................................................................................................................................................................*..................... 
// @slothy:writes=STACK_OFFSET_COUNT // @slothy:interleaving_class=0 + cmp x25, #(KECCAK_F1600_ROUNDS-1) // ...........................................................................................................................................................................*..................... // @slothy:ignore_useless_output // @slothy:interleaving_class=0 + bic v31.16B, v0.16B, v24.16B // ............................................................................................................................................................................*.................... // @slothy:interleaving_class=1 + eor x25, x1, x27, ror #53 // ............................................................................................................................................................................*.................... // @slothy:interleaving_class=0 + eor v23.16B, v31.16B, v23.16B // .............................................................................................................................................................................*................... // @slothy:interleaving_class=1 + bic x27, x30, x26, ror #47 // .............................................................................................................................................................................*................... // @slothy:interleaving_class=0 + bic v31.16B, v1.16B, v0.16B // ..............................................................................................................................................................................*.................. // @slothy:interleaving_class=1 + eor x1, x5, x28 // ..............................................................................................................................................................................*.................. 
// @slothy:interleaving_class=0 + eor x5, x4, x11, ror #41 // ...............................................................................................................................................................................*................. // @slothy:interleaving_class=0 + eor v24.16B, v31.16B, v24.16B // ...............................................................................................................................................................................*................. // @slothy:interleaving_class=1 + eor x11, x19, x13, ror #35 // ................................................................................................................................................................................*................ // @slothy:interleaving_class=0 + bic v31.16B, v2.16B, v27.16B // ................................................................................................................................................................................*................ // @slothy:interleaving_class=1 + bic x13, x26, x24, ror #10 // .................................................................................................................................................................................*............... // @slothy:interleaving_class=0 + eor x28, x27, x24, ror #57 // .................................................................................................................................................................................*............... // @slothy:interleaving_class=0 + eor v0.16B, v31.16B, v30.16B // ..................................................................................................................................................................................*.............. 
// @slothy:interleaving_class=1 + bic x27, x24, x9, ror #47 // ..................................................................................................................................................................................*.............. // @slothy:interleaving_class=0 + bic v31.16B, v3.16B, v2.16B // ...................................................................................................................................................................................*............. // @slothy:interleaving_class=1 + bic x19, x23, x3, ror #9 // ...................................................................................................................................................................................*............. // @slothy:interleaving_class=0 + eor v1.16B, v31.16B, v27.16B // ....................................................................................................................................................................................*............ // @slothy:interleaving_class=1 + bic x4, x29, x14, ror #41 // ....................................................................................................................................................................................*............ // @slothy:interleaving_class=0 + eor x24, x19, x29, ror #44 // .....................................................................................................................................................................................*........... // @slothy:interleaving_class=0 + bic v31.16B, v4.16B, v3.16B // .....................................................................................................................................................................................*........... 
// @slothy:interleaving_class=1 + bic x29, x3, x29, ror #35 // ......................................................................................................................................................................................*.......... // @slothy:interleaving_class=0 + eor v2.16B, v31.16B, v2.16B // ......................................................................................................................................................................................*.......... // @slothy:interleaving_class=1 + eor x13, x13, x9, ror #57 // .......................................................................................................................................................................................*......... // @slothy:interleaving_class=0 + bic v31.16B, v30.16B, v4.16B // .......................................................................................................................................................................................*......... // @slothy:interleaving_class=1 + eor x19, x29, x14, ror #12 // ........................................................................................................................................................................................*........ // @slothy:interleaving_class=0 + bic x29, x9, x0, ror #19 // ........................................................................................................................................................................................*........ // @slothy:interleaving_class=0 + eor v3.16B, v31.16B, v3.16B // .........................................................................................................................................................................................*....... 
// @slothy:interleaving_class=1 + bic x14, x14, x8, ror #5 // .........................................................................................................................................................................................*....... // @slothy:interleaving_class=0 + bic v31.16B, v27.16B, v30.16B // ..........................................................................................................................................................................................*...... // @slothy:interleaving_class=1 + eor x9, x14, x23, ror #43 // ..........................................................................................................................................................................................*...... // @slothy:interleaving_class=0 + eor x14, x4, x8, ror #46 // ...........................................................................................................................................................................................*..... // @slothy:interleaving_class=0 + eor v4.16B, v31.16B, v4.16B // ...........................................................................................................................................................................................*..... // @slothy:interleaving_class=1 + bic x23, x8, x23, ror #38 // ............................................................................................................................................................................................*.... // @slothy:interleaving_class=0 + eor x8, x27, x0, ror #2 // ............................................................................................................................................................................................*.... 
// @slothy:interleaving_class=0 + eor x4, x23, x3, ror #47 // .............................................................................................................................................................................................*... // @slothy:interleaving_class=0 + bic x3, x0, x30, ror #5 // .............................................................................................................................................................................................*... // @slothy:interleaving_class=0 + eor x23, x3, x26, ror #52 // ..............................................................................................................................................................................................*.. // @slothy:interleaving_class=0 + eor x3, x29, x30, ror #24 // ..............................................................................................................................................................................................*.. // @slothy:interleaving_class=0 + ldr x30, [sp, #STACK_OFFSET_CONST_VECTOR] // ...............................................................................................................................................................................................*. // @slothy:reads=STACK_OFFSET_CONST_VECTOR // @slothy:interleaving_class=1 + ld1r {v28.2D}, [x30], #8 // ...............................................................................................................................................................................................*. 
// @slothy:interleaving_class=1 + str x30, [sp, #STACK_OFFSET_CONST_VECTOR] // ................................................................................................................................................................................................* // @slothy:writes=STACK_OFFSET_CONST_VECTOR // @slothy:interleaving_class=1 + eor v0.16B, v0.16B, v28.16B // ................................................................................................................................................................................................* // @slothy:interleaving_class=1 + + // --------------------------------------------------------------------------------------- cycle (expected) ---------------------------------------------------------------------------------------> + // 0 25 50 75 100 125 150 175 + // |------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|------------------------|----------------- + // eor x0, x15, x11, ror #52 // *................................................................................................................................................................................................ + // eor x0, x0, x13, ror #48 // *................................................................................................................................................................................................ + // eor x26, x8, x9, ror #57 // ..*.............................................................................................................................................................................................. + // eor x27, x0, x14, ror #10 // ...*............................................................................................................................................................................................. 
+ // eor x29, x16, x28, ror #63 // ...*............................................................................................................................................................................................. + // eor x26, x26, x6, ror #51 // ....*............................................................................................................................................................................................ + // eor x30, x23, x22, ror #50 // .....*........................................................................................................................................................................................... + // eor x0, x26, x10, ror #31 // ......*.......................................................................................................................................................................................... + // eor x29, x29, x19, ror #37 // .......*......................................................................................................................................................................................... + // eor x27, x27, x12, ror #5 // ........*........................................................................................................................................................................................ + // eor x30, x30, x24, ror #34 // .........*....................................................................................................................................................................................... + // eor x0, x0, x7, ror #27 // .........*....................................................................................................................................................................................... 
+ // eor x26, x30, x21, ror #26 // ..........*...................................................................................................................................................................................... + // eor x26, x26, x25, ror #15 // ...........*..................................................................................................................................................................................... + // ror x30, x27, #62 // ............*.................................................................................................................................................................................... + // eor x30, x30, x26, ror #57 // .............*................................................................................................................................................................................... + // ror x26, x26, #58 // ..............*.................................................................................................................................................................................. + // eor x16, x30, x16 // ...............*................................................................................................................................................................................. + // eor x28, x30, x28, ror #63 // ................*................................................................................................................................................................................ + // str x28, [sp, #STACK_LOC_0] // ................*................................................................................................................................................................................ 
+ // eor x29, x29, x17, ror #36 // .................*............................................................................................................................................................................... + // eor x28, x1, x2, ror #61 // ..................*.............................................................................................................................................................................. + // eor x19, x30, x19, ror #37 // ...................*............................................................................................................................................................................. + // eor x29, x29, x20, ror #2 // ....................*............................................................................................................................................................................ + // eor x28, x28, x4, ror #54 // .....................*........................................................................................................................................................................... + // eor x26, x26, x0, ror #55 // ......................*.......................................................................................................................................................................... + // eor x28, x28, x3, ror #39 // ......................*.......................................................................................................................................................................... + // eor x28, x28, x5, ror #25 // .......................*......................................................................................................................................................................... 
+ // ror x0, x0, #56 // ........................*........................................................................................................................................................................ + // eor x0, x0, x29, ror #63 // .........................*....................................................................................................................................................................... + // eor x27, x28, x27, ror #61 // ..........................*...................................................................................................................................................................... + // eor x13, x0, x13, ror #46 // ...........................*..................................................................................................................................................................... + // eor x28, x29, x28, ror #63 // ............................*.................................................................................................................................................................... + // eor x29, x30, x20, ror #2 // ............................*.................................................................................................................................................................... + // eor x20, x26, x3, ror #39 // .............................*................................................................................................................................................................... + // eor x11, x0, x11, ror #50 // ..............................*.................................................................................................................................................................. 
+ // eor x25, x28, x25, ror #9 // ...............................*................................................................................................................................................................. + // eor x3, x28, x21, ror #20 // ................................*................................................................................................................................................................ + // eor x21, x26, x1 // .................................*............................................................................................................................................................... + // eor x9, x27, x9, ror #49 // ..................................*.............................................................................................................................................................. + // eor x24, x28, x24, ror #28 // ..................................*.............................................................................................................................................................. + // eor x1, x30, x17, ror #36 // ...................................*............................................................................................................................................................. + // eor x14, x0, x14, ror #8 // ....................................*............................................................................................................................................................ + // eor x22, x28, x22, ror #44 // .....................................*........................................................................................................................................................... 
+ // eor x8, x27, x8, ror #56 // ......................................*.......................................................................................................................................................... + // eor x17, x27, x7, ror #19 // .......................................*......................................................................................................................................................... + // eor x15, x0, x15, ror #62 // ........................................*........................................................................................................................................................ + // bic x7, x20, x22, ror #47 // ........................................*........................................................................................................................................................ + // eor x4, x26, x4, ror #54 // .........................................*....................................................................................................................................................... + // eor x0, x0, x12, ror #3 // ..........................................*...................................................................................................................................................... + // eor x28, x28, x23, ror #58 // ...........................................*..................................................................................................................................................... + // eor x23, x26, x2, ror #61 // ............................................*.................................................................................................................................................... 
+ // eor x26, x26, x5, ror #25 // .............................................*................................................................................................................................................... + // eor x2, x7, x16, ror #39 // ..............................................*.................................................................................................................................................. + // bic x7, x9, x20, ror #42 // ...............................................*................................................................................................................................................. + // bic x30, x15, x9, ror #16 // ...............................................*................................................................................................................................................. + // eor x7, x7, x22, ror #25 // ................................................*................................................................................................................................................ + // eor x12, x30, x20, ror #58 // .................................................*............................................................................................................................................... + // bic x20, x22, x16, ror #56 // ..................................................*.............................................................................................................................................. + // eor x30, x27, x6, ror #43 // ...................................................*............................................................................................................................................. 
+ // eor x22, x20, x15, ror #23 // ....................................................*............................................................................................................................................ + // bic x6, x19, x13, ror #42 // .....................................................*........................................................................................................................................... + // eor x6, x6, x17, ror #41 // .....................................................*........................................................................................................................................... + // bic x5, x13, x17, ror #63 // ......................................................*.......................................................................................................................................... + // eor x5, x21, x5, ror #21 // .......................................................*......................................................................................................................................... + // bic x17, x17, x21, ror #44 // ........................................................*........................................................................................................................................ + // eor x27, x27, x10, ror #23 // .........................................................*....................................................................................................................................... + // bic x21, x21, x25, ror #50 // ..........................................................*...................................................................................................................................... 
+ // bic x20, x27, x4, ror #25 // ...........................................................*..................................................................................................................................... + // bic x10, x16, x15, ror #31 // ...........................................................*..................................................................................................................................... + // eor x16, x21, x19, ror #43 // ............................................................*.................................................................................................................................... + // eor x21, x17, x25, ror #30 // .............................................................*................................................................................................................................... + // bic x19, x25, x19, ror #57 // ..............................................................*.................................................................................................................................. + // ldr x25, [sp, #STACK_OFFSET_COUNT] // ...............................................................*................................................................................................................................. + // eor x17, x10, x9, ror #47 // ................................................................*................................................................................................................................ + // ldr x9, [sp, #STACK_OFFSET_CONST_SCALAR] // .................................................................*............................................................................................................................... 
+ // eor x15, x20, x28, ror #27 // .................................................................*............................................................................................................................... + // bic x20, x4, x28, ror #2 // ..................................................................*.............................................................................................................................. + // eor x10, x20, x1, ror #50 // ...................................................................*............................................................................................................................. + // bic x20, x11, x27, ror #60 // ....................................................................*............................................................................................................................ + // eor x20, x20, x4, ror #21 // .....................................................................*........................................................................................................................... + // bic x4, x28, x1, ror #48 // ......................................................................*.......................................................................................................................... + // bic x1, x1, x11, ror #57 // .......................................................................*......................................................................................................................... + // ldr x28, [x9, w25, UXTW #3] // ........................................................................*........................................................................................................................ 
+ // ldr x9, [sp, #STACK_LOC_0] // ........................................................................*........................................................................................................................ + // add x25, x25, #1 // .........................................................................*....................................................................................................................... + // str x25, [sp, #STACK_OFFSET_COUNT] // ..........................................................................*...................................................................................................................... + // cmp x25, #(KECCAK_F1600_ROUNDS-1) // ...........................................................................*..................................................................................................................... + // eor x25, x1, x27, ror #53 // ............................................................................*.................................................................................................................... + // bic x27, x30, x26, ror #47 // .............................................................................*................................................................................................................... + // eor x1, x5, x28 // ..............................................................................*.................................................................................................................. + // eor x5, x4, x11, ror #41 // ..............................................................................*.................................................................................................................. 
+ // eor x11, x19, x13, ror #35 // ...............................................................................*................................................................................................................. + // bic x13, x26, x24, ror #10 // ................................................................................*................................................................................................................ + // eor x28, x27, x24, ror #57 // .................................................................................*............................................................................................................... + // bic x27, x24, x9, ror #47 // ..................................................................................*.............................................................................................................. + // bic x19, x23, x3, ror #9 // ...................................................................................*............................................................................................................. + // bic x4, x29, x14, ror #41 // ....................................................................................*............................................................................................................ + // eor x24, x19, x29, ror #44 // ....................................................................................*............................................................................................................ + // bic x29, x3, x29, ror #35 // .....................................................................................*........................................................................................................... 
+ // eor x13, x13, x9, ror #57 // ......................................................................................*.......................................................................................................... + // eor x19, x29, x14, ror #12 // .......................................................................................*......................................................................................................... + // bic x29, x9, x0, ror #19 // ........................................................................................*........................................................................................................ + // bic x14, x14, x8, ror #5 // .........................................................................................*....................................................................................................... + // eor x9, x14, x23, ror #43 // ..........................................................................................*...................................................................................................... + // eor x14, x4, x8, ror #46 // ..........................................................................................*...................................................................................................... + // bic x23, x8, x23, ror #38 // ...........................................................................................*..................................................................................................... + // eor x8, x27, x0, ror #2 // ............................................................................................*.................................................................................................... 
+ // eor x4, x23, x3, ror #47 // .............................................................................................*................................................................................................... + // bic x3, x0, x30, ror #5 // ..............................................................................................*.................................................................................................. + // eor x23, x3, x26, ror #52 // ...............................................................................................*................................................................................................. + // eor x3, x29, x30, ror #24 // ................................................................................................*................................................................................................ + // eor x0, x15, x11, ror #52 // ................................................................................................*................................................................................................ + // eor x0, x0, x13, ror #48 // .................................................................................................*............................................................................................... + // eor x26, x8, x9, ror #57 // ..................................................................................................*.............................................................................................. + // eor x27, x0, x14, ror #10 // ...................................................................................................*............................................................................................. 
+ // eor x29, x16, x28, ror #63 // ....................................................................................................*............................................................................................ + // eor x26, x26, x6, ror #51 // .....................................................................................................*........................................................................................... + // eor x30, x23, x22, ror #50 // ......................................................................................................*.......................................................................................... + // eor x0, x26, x10, ror #31 // .......................................................................................................*......................................................................................... + // eor x29, x29, x19, ror #37 // .......................................................................................................*......................................................................................... + // eor x27, x27, x12, ror #5 // ........................................................................................................*........................................................................................ + // eor x30, x30, x24, ror #34 // .........................................................................................................*....................................................................................... + // eor x0, x0, x7, ror #27 // ..........................................................................................................*...................................................................................... 
+ // eor x26, x30, x21, ror #26 // ...........................................................................................................*..................................................................................... + // eor x26, x26, x25, ror #15 // ............................................................................................................*.................................................................................... + // ror x30, x27, #62 // .............................................................................................................*................................................................................... + // eor x30, x30, x26, ror #57 // .............................................................................................................*................................................................................... + // ror x26, x26, #58 // ..............................................................................................................*.................................................................................. + // eor x16, x30, x16 // ...............................................................................................................*................................................................................. + // eor x28, x30, x28, ror #63 // ................................................................................................................*................................................................................ + // str x28, [sp, #STACK_LOC_0] // .................................................................................................................*............................................................................... 
+ // eor x29, x29, x17, ror #36 // ..................................................................................................................*.............................................................................. + // eor x28, x1, x2, ror #61 // ...................................................................................................................*............................................................................. + // eor x19, x30, x19, ror #37 // ...................................................................................................................*............................................................................. + // eor x29, x29, x20, ror #2 // ....................................................................................................................*............................................................................ + // eor x28, x28, x4, ror #54 // .....................................................................................................................*........................................................................... + // eor x26, x26, x0, ror #55 // ......................................................................................................................*.......................................................................... + // eor x28, x28, x3, ror #39 // .......................................................................................................................*......................................................................... + // eor x28, x28, x5, ror #25 // ........................................................................................................................*........................................................................ 
+ // ror x0, x0, #56 // .........................................................................................................................*....................................................................... + // eor x0, x0, x29, ror #63 // .........................................................................................................................*....................................................................... + // eor x27, x28, x27, ror #61 // ..........................................................................................................................*...................................................................... + // eor x13, x0, x13, ror #46 // ...........................................................................................................................*..................................................................... + // eor x28, x29, x28, ror #63 // ............................................................................................................................*.................................................................... + // eor x29, x30, x20, ror #2 // .............................................................................................................................*................................................................... + // eor x20, x26, x3, ror #39 // ..............................................................................................................................*.................................................................. + // eor x11, x0, x11, ror #50 // ...............................................................................................................................*................................................................. 
+ // eor x25, x28, x25, ror #9 // ................................................................................................................................*................................................................ + // eor x3, x28, x21, ror #20 // ................................................................................................................................*................................................................ + // eor x21, x26, x1 // .................................................................................................................................*............................................................... + // eor x9, x27, x9, ror #49 // ..................................................................................................................................*.............................................................. + // eor x24, x28, x24, ror #28 // ...................................................................................................................................*............................................................. + // eor x1, x30, x17, ror #36 // ....................................................................................................................................*............................................................ + // eor x14, x0, x14, ror #8 // .....................................................................................................................................*........................................................... + // eor x22, x28, x22, ror #44 // ......................................................................................................................................*.......................................................... 
+ // eor x8, x27, x8, ror #56 // ......................................................................................................................................*.......................................................... + // eor x17, x27, x7, ror #19 // .......................................................................................................................................*......................................................... + // eor x15, x0, x15, ror #62 // ........................................................................................................................................*........................................................ + // bic x7, x20, x22, ror #47 // .........................................................................................................................................*....................................................... + // eor x4, x26, x4, ror #54 // ..........................................................................................................................................*...................................................... + // eor x0, x0, x12, ror #3 // ...........................................................................................................................................*..................................................... + // eor x28, x28, x23, ror #58 // ............................................................................................................................................*.................................................... + // eor x23, x26, x2, ror #61 // ............................................................................................................................................*.................................................... 
+ // eor x26, x26, x5, ror #25 // .............................................................................................................................................*................................................... + // eor x2, x7, x16, ror #39 // ..............................................................................................................................................*.................................................. + // bic x7, x9, x20, ror #42 // ...............................................................................................................................................*................................................. + // bic x30, x15, x9, ror #16 // ................................................................................................................................................*................................................ + // eor x7, x7, x22, ror #25 // .................................................................................................................................................*............................................... + // eor x12, x30, x20, ror #58 // ..................................................................................................................................................*.............................................. + // bic x20, x22, x16, ror #56 // ..................................................................................................................................................*.............................................. + // eor x30, x27, x6, ror #43 // ...................................................................................................................................................*............................................. 
+ // eor x22, x20, x15, ror #23 // ....................................................................................................................................................*............................................ + // bic x6, x19, x13, ror #42 // .....................................................................................................................................................*........................................... + // eor x6, x6, x17, ror #41 // ......................................................................................................................................................*.......................................... + // bic x5, x13, x17, ror #63 // .......................................................................................................................................................*......................................... + // eor x5, x21, x5, ror #21 // ........................................................................................................................................................*........................................ + // bic x17, x17, x21, ror #44 // .........................................................................................................................................................*....................................... + // eor x27, x27, x10, ror #23 // .........................................................................................................................................................*....................................... + // bic x21, x21, x25, ror #50 // ..........................................................................................................................................................*...................................... 
+ // bic x20, x27, x4, ror #25 // ...........................................................................................................................................................*..................................... + // bic x10, x16, x15, ror #31 // ............................................................................................................................................................*.................................... + // eor x16, x21, x19, ror #43 // .............................................................................................................................................................*................................... + // eor x21, x17, x25, ror #30 // ..............................................................................................................................................................*.................................. + // bic x19, x25, x19, ror #57 // ...............................................................................................................................................................*................................. + // ldr x25, [sp, #STACK_OFFSET_COUNT] // ...............................................................................................................................................................*................................. + // eor x17, x10, x9, ror #47 // ................................................................................................................................................................*................................ + // ldr x9, [sp, #STACK_OFFSET_CONST_SCALAR] // .................................................................................................................................................................*............................... 
+ // eor x15, x20, x28, ror #27 // ..................................................................................................................................................................*.............................. + // bic x20, x4, x28, ror #2 // ...................................................................................................................................................................*............................. + // eor x10, x20, x1, ror #50 // ....................................................................................................................................................................*............................ + // bic x20, x11, x27, ror #60 // .....................................................................................................................................................................*........................... + // eor x20, x20, x4, ror #21 // .....................................................................................................................................................................*........................... + // bic x4, x28, x1, ror #48 // ......................................................................................................................................................................*.......................... + // bic x1, x1, x11, ror #57 // .......................................................................................................................................................................*......................... + // ldr x28, [x9, w25, UXTW #3] // ........................................................................................................................................................................*........................ 
+ // ldr x9, [sp, #STACK_LOC_0] // .........................................................................................................................................................................*....................... + // add x25, x25, #1 // ..........................................................................................................................................................................*...................... + // str x25, [sp, #STACK_OFFSET_COUNT] // ...........................................................................................................................................................................*..................... + // cmp x25, #(KECCAK_F1600_ROUNDS-1) // ...........................................................................................................................................................................*..................... + // eor x25, x1, x27, ror #53 // ............................................................................................................................................................................*.................... + // bic x27, x30, x26, ror #47 // .............................................................................................................................................................................*................... + // eor x1, x5, x28 // ..............................................................................................................................................................................*.................. + // eor x5, x4, x11, ror #41 // ...............................................................................................................................................................................*................. 
+ // eor x11, x19, x13, ror #35 // ................................................................................................................................................................................*................ + // bic x13, x26, x24, ror #10 // .................................................................................................................................................................................*............... + // eor x28, x27, x24, ror #57 // .................................................................................................................................................................................*............... + // bic x27, x24, x9, ror #47 // ..................................................................................................................................................................................*.............. + // bic x19, x23, x3, ror #9 // ...................................................................................................................................................................................*............. + // bic x4, x29, x14, ror #41 // ....................................................................................................................................................................................*............ + // eor x24, x19, x29, ror #44 // .....................................................................................................................................................................................*........... + // bic x29, x3, x29, ror #35 // ......................................................................................................................................................................................*.......... 
+ // eor x13, x13, x9, ror #57 // .......................................................................................................................................................................................*......... + // eor x19, x29, x14, ror #12 // ........................................................................................................................................................................................*........ + // bic x29, x9, x0, ror #19 // ........................................................................................................................................................................................*........ + // bic x14, x14, x8, ror #5 // .........................................................................................................................................................................................*....... + // eor x9, x14, x23, ror #43 // ..........................................................................................................................................................................................*...... + // eor x14, x4, x8, ror #46 // ...........................................................................................................................................................................................*..... + // bic x23, x8, x23, ror #38 // ............................................................................................................................................................................................*.... + // eor x8, x27, x0, ror #2 // ............................................................................................................................................................................................*.... 
+ // eor x4, x23, x3, ror #47 // .............................................................................................................................................................................................*... + // bic x3, x0, x30, ror #5 // .............................................................................................................................................................................................*... + // eor x23, x3, x26, ror #52 // ..............................................................................................................................................................................................*.. + // eor x3, x29, x30, ror #24 // ..............................................................................................................................................................................................*.. + // eor v30.16b, v0.16b, v5.16b // .*............................................................................................................................................................................................... + // eor v30.16b, v30.16b, v10.16b // .*............................................................................................................................................................................................... + // eor v30.16b, v30.16b, v15.16b // ..*.............................................................................................................................................................................................. + // eor v30.16b, v30.16b, v20.16b // ....*............................................................................................................................................................................................ 
+ // eor v29.16b, v1.16b, v6.16b // .....*........................................................................................................................................................................................... + // eor v29.16b, v29.16b, v11.16b // ......*.......................................................................................................................................................................................... + // eor v29.16b, v29.16b, v16.16b // .......*......................................................................................................................................................................................... + // eor v29.16b, v29.16b, v21.16b // ........*........................................................................................................................................................................................ + // eor v28.16b, v2.16b, v7.16b // ..........*...................................................................................................................................................................................... + // eor v28.16b, v28.16b, v12.16b // ...........*..................................................................................................................................................................................... + // eor v28.16b, v28.16b, v17.16b // ............*.................................................................................................................................................................................... + // eor v28.16b, v28.16b, v22.16b // .............*................................................................................................................................................................................... 
+ // eor v27.16b, v3.16b, v8.16b // ..............*.................................................................................................................................................................................. + // eor v27.16b, v27.16b, v13.16b // ...............*................................................................................................................................................................................. + // eor v27.16b, v27.16b, v18.16b // .................*............................................................................................................................................................................... + // eor v27.16b, v27.16b, v23.16b // ..................*.............................................................................................................................................................................. + // eor v26.16b, v4.16b, v9.16b // ...................*............................................................................................................................................................................. + // eor v26.16b, v26.16b, v14.16b // ....................*............................................................................................................................................................................ + // eor v26.16b, v26.16b, v19.16b // .....................*........................................................................................................................................................................... + // eor v26.16b, v26.16b, v24.16b // .......................*......................................................................................................................................................................... 
+ // add v31.2d, v28.2d, v28.2d // ........................*........................................................................................................................................................................ + // sri v31.2d, v28.2d, #63 // .........................*....................................................................................................................................................................... + // eor v25.16b, v31.16b, v30.16b // ..........................*...................................................................................................................................................................... + // add v31.2d, v26.2d, v26.2d // ...........................*..................................................................................................................................................................... + // sri v31.2d, v26.2d, #63 // .............................*................................................................................................................................................................... + // eor v28.16b, v31.16b, v28.16b // ..............................*.................................................................................................................................................................. + // add v31.2d, v29.2d, v29.2d // ...............................*................................................................................................................................................................. + // sri v31.2d, v29.2d, #63 // ................................*................................................................................................................................................................ 
+ // eor v26.16b, v31.16b, v26.16b // .................................*............................................................................................................................................................... + // add v31.2d, v27.2d, v27.2d // ...................................*............................................................................................................................................................. + // sri v31.2d, v27.2d, #63 // ....................................*............................................................................................................................................................ + // eor v29.16b, v31.16b, v29.16b // .....................................*........................................................................................................................................................... + // add v31.2d, v30.2d, v30.2d // ......................................*.......................................................................................................................................................... + // sri v31.2d, v30.2d, #63 // .......................................*......................................................................................................................................................... + // eor v27.16b, v31.16b, v27.16b // .........................................*....................................................................................................................................................... + // eor v30.16b, v0.16b, v26.16b // ..........................................*...................................................................................................................................................... 
+ // eor v31.16b, v2.16b, v29.16b // ...........................................*..................................................................................................................................................... + // shl v0.2d, v31.2d, #(64-2) // ............................................*.................................................................................................................................................... + // sri v0.2d, v31.2d, #(2) // .............................................*................................................................................................................................................... + // eor v31.16b, v12.16b, v29.16b // ..............................................*.................................................................................................................................................. + // shl v2.2d, v31.2d, #(64-21) // ................................................*................................................................................................................................................ + // sri v2.2d, v31.2d, #(21) // .................................................*............................................................................................................................................... + // eor v31.16b, v13.16b, v28.16b // ..................................................*.............................................................................................................................................. + // shl v12.2d, v31.2d, #(64-39) // ...................................................*............................................................................................................................................. 
+ // sri v12.2d, v31.2d, #(39) // ....................................................*............................................................................................................................................ + // eor v31.16b, v19.16b, v27.16b // ......................................................*.......................................................................................................................................... + // shl v13.2d, v31.2d, #(64-56) // .......................................................*......................................................................................................................................... + // sri v13.2d, v31.2d, #(56) // ........................................................*........................................................................................................................................ + // eor v31.16b, v23.16b, v28.16b // .........................................................*....................................................................................................................................... + // shl v19.2d, v31.2d, #(64-8) // ..........................................................*...................................................................................................................................... + // sri v19.2d, v31.2d, #(8) // ............................................................*.................................................................................................................................... + // eor v31.16b, v15.16b, v26.16b // .............................................................*................................................................................................................................... 
+ // shl v23.2d, v31.2d, #(64-23) // ..............................................................*.................................................................................................................................. + // sri v23.2d, v31.2d, #(23) // ...............................................................*................................................................................................................................. + // eor v31.16b, v1.16b, v25.16b // ................................................................*................................................................................................................................ + // shl v15.2d, v31.2d, #(64-63) // ..................................................................*.............................................................................................................................. + // sri v15.2d, v31.2d, #(63) // ...................................................................*............................................................................................................................. + // eor v31.16b, v8.16b, v28.16b // ....................................................................*............................................................................................................................ + // shl v1.2d, v31.2d, #(64-9) // .....................................................................*........................................................................................................................... + // sri v1.2d, v31.2d, #(9) // ......................................................................*.......................................................................................................................... 
+ // eor v31.16b, v16.16b, v25.16b // .......................................................................*......................................................................................................................... + // shl v8.2d, v31.2d, #(64-19) // .........................................................................*....................................................................................................................... + // sri v8.2d, v31.2d, #(19) // ..........................................................................*...................................................................................................................... + // eor v31.16b, v7.16b, v29.16b // ...........................................................................*..................................................................................................................... + // shl v16.2d, v31.2d, #(64-58) // ............................................................................*.................................................................................................................... + // sri v16.2d, v31.2d, #(58) // .............................................................................*................................................................................................................... + // eor v31.16b, v10.16b, v26.16b // ...............................................................................*................................................................................................................. + // shl v7.2d, v31.2d, #(64-61) // ................................................................................*................................................................................................................ 
+ // sri v7.2d, v31.2d, #(61) // .................................................................................*............................................................................................................... + // eor v31.16b, v3.16b, v28.16b // ..................................................................................*.............................................................................................................. + // shl v10.2d, v31.2d, #(64-36) // ...................................................................................*............................................................................................................. + // sri v10.2d, v31.2d, #(36) // .....................................................................................*........................................................................................................... + // eor v31.16b, v18.16b, v28.16b // ......................................................................................*.......................................................................................................... + // shl v3.2d, v31.2d, #(64-43) // .......................................................................................*......................................................................................................... + // sri v3.2d, v31.2d, #(43) // ........................................................................................*........................................................................................................ + // eor v31.16b, v17.16b, v29.16b // .........................................................................................*....................................................................................................... 
+ // shl v18.2d, v31.2d, #(64-49) // ...........................................................................................*..................................................................................................... + // sri v18.2d, v31.2d, #(49) // ............................................................................................*.................................................................................................... + // eor v31.16b, v11.16b, v25.16b // .............................................................................................*................................................................................................... + // shl v17.2d, v31.2d, #(64-54) // ..............................................................................................*.................................................................................................. + // sri v17.2d, v31.2d, #(54) // ...............................................................................................*................................................................................................. + // eor v31.16b, v9.16b, v27.16b // .................................................................................................*............................................................................................... + // shl v11.2d, v31.2d, #(64-44) // ..................................................................................................*.............................................................................................. + // sri v11.2d, v31.2d, #(44) // ...................................................................................................*............................................................................................. 
+ // eor v31.16b, v22.16b, v29.16b // ....................................................................................................*............................................................................................ + // shl v9.2d, v31.2d, #(64-3) // .....................................................................................................*........................................................................................... + // sri v9.2d, v31.2d, #(3) // ......................................................................................................*.......................................................................................... + // eor v31.16b, v14.16b, v27.16b // ........................................................................................................*........................................................................................ + // shl v22.2d, v31.2d, #(64-25) // .........................................................................................................*....................................................................................... + // sri v22.2d, v31.2d, #(25) // ..........................................................................................................*...................................................................................... + // eor v31.16b, v20.16b, v26.16b // ...........................................................................................................*..................................................................................... + // shl v14.2d, v31.2d, #(64-46) // ............................................................................................................*.................................................................................... 
+ // sri v14.2d, v31.2d, #(46) // ..............................................................................................................*.................................................................................. + // eor v31.16b, v4.16b, v27.16b // ...............................................................................................................*................................................................................. + // shl v20.2d, v31.2d, #(64-37) // ................................................................................................................*................................................................................ + // sri v20.2d, v31.2d, #(37) // .................................................................................................................*............................................................................... + // eor v31.16b, v24.16b, v27.16b // ..................................................................................................................*.............................................................................. + // shl v4.2d, v31.2d, #(64-50) // ....................................................................................................................*............................................................................ + // sri v4.2d, v31.2d, #(50) // .....................................................................................................................*........................................................................... + // eor v31.16b, v21.16b, v25.16b // ......................................................................................................................*.......................................................................... 
+ // shl v24.2d, v31.2d, #(64-62) // .......................................................................................................................*......................................................................... + // sri v24.2d, v31.2d, #(62) // ........................................................................................................................*........................................................................ + // eor v31.16b, v5.16b, v26.16b // ..........................................................................................................................*...................................................................... + // shl v21.2d, v31.2d, #(64-28) // ...........................................................................................................................*..................................................................... + // sri v21.2d, v31.2d, #(28) // ............................................................................................................................*.................................................................... + // eor v31.16b, v6.16b, v25.16b // .............................................................................................................................*................................................................... + // shl v27.2d, v31.2d, #(64-20) // ..............................................................................................................................*.................................................................. + // sri v27.2d, v31.2d, #(20) // ...............................................................................................................................*................................................................. 
+ // ldr x30, [sp, #STACK_OFFSET_CONST_VECTOR] // ...............................................................................................................................................................................................*. + // ld1r {v28.2d}, [x30], #8 // ...............................................................................................................................................................................................*. + // str x30, [sp, #STACK_OFFSET_CONST_VECTOR] // ................................................................................................................................................................................................* + // bic v31.16b, v7.16b, v11.16b // .................................................................................................................................*............................................................... + // eor v5.16b, v31.16b, v10.16b // ..................................................................................................................................*.............................................................. + // bic v31.16b, v8.16b, v7.16b // ...................................................................................................................................*............................................................. + // eor v6.16b, v31.16b, v11.16b // ....................................................................................................................................*............................................................ + // bic v31.16b, v9.16b, v8.16b // .....................................................................................................................................*........................................................... 
+ // eor v7.16b, v31.16b, v7.16b // .......................................................................................................................................*......................................................... + // bic v31.16b, v10.16b, v9.16b // ........................................................................................................................................*........................................................ + // eor v8.16b, v31.16b, v8.16b // .........................................................................................................................................*....................................................... + // bic v31.16b, v11.16b, v10.16b // ..........................................................................................................................................*...................................................... + // eor v9.16b, v31.16b, v9.16b // ...........................................................................................................................................*..................................................... + // bic v31.16b, v12.16b, v16.16b // .............................................................................................................................................*................................................... + // eor v10.16b, v31.16b, v15.16b // ..............................................................................................................................................*.................................................. + // bic v31.16b, v13.16b, v12.16b // ...............................................................................................................................................*................................................. 
+ // eor v11.16b, v31.16b, v16.16b // ................................................................................................................................................*................................................ + // bic v31.16b, v14.16b, v13.16b // .................................................................................................................................................*............................................... + // eor v12.16b, v31.16b, v12.16b // ...................................................................................................................................................*............................................. + // bic v31.16b, v15.16b, v14.16b // ....................................................................................................................................................*............................................ + // eor v13.16b, v31.16b, v13.16b // .....................................................................................................................................................*........................................... + // bic v31.16b, v16.16b, v15.16b // ......................................................................................................................................................*.......................................... + // eor v14.16b, v31.16b, v14.16b // .......................................................................................................................................................*......................................... + // bic v31.16b, v17.16b, v21.16b // ........................................................................................................................................................*........................................ 
+ // eor v15.16b, v31.16b, v20.16b // ..........................................................................................................................................................*...................................... + // bic v31.16b, v18.16b, v17.16b // ...........................................................................................................................................................*..................................... + // eor v16.16b, v31.16b, v21.16b // ............................................................................................................................................................*.................................... + // bic v31.16b, v19.16b, v18.16b // .............................................................................................................................................................*................................... + // eor v17.16b, v31.16b, v17.16b // ..............................................................................................................................................................*.................................. + // bic v31.16b, v20.16b, v19.16b // ................................................................................................................................................................*................................ + // eor v18.16b, v31.16b, v18.16b // .................................................................................................................................................................*............................... + // bic v31.16b, v21.16b, v20.16b // ..................................................................................................................................................................*.............................. 
+ // eor v19.16b, v31.16b, v19.16b // ...................................................................................................................................................................*............................. + // bic v31.16b, v22.16b, v1.16b // ....................................................................................................................................................................*............................ + // eor v20.16b, v31.16b, v0.16b // ......................................................................................................................................................................*.......................... + // bic v31.16b, v23.16b, v22.16b // .......................................................................................................................................................................*......................... + // eor v21.16b, v31.16b, v1.16b // ........................................................................................................................................................................*........................ + // bic v31.16b, v24.16b, v23.16b // .........................................................................................................................................................................*....................... + // eor v22.16b, v31.16b, v22.16b // ..........................................................................................................................................................................*...................... + // bic v31.16b, v0.16b, v24.16b // ............................................................................................................................................................................*.................... 
+ // eor v23.16b, v31.16b, v23.16b // .............................................................................................................................................................................*................... + // bic v31.16b, v1.16b, v0.16b // ..............................................................................................................................................................................*.................. + // eor v24.16b, v31.16b, v24.16b // ...............................................................................................................................................................................*................. + // bic v31.16b, v2.16b, v27.16b // ................................................................................................................................................................................*................ + // eor v0.16b, v31.16b, v30.16b // ..................................................................................................................................................................................*.............. + // bic v31.16b, v3.16b, v2.16b // ...................................................................................................................................................................................*............. + // eor v1.16b, v31.16b, v27.16b // ....................................................................................................................................................................................*............ + // bic v31.16b, v4.16b, v3.16b // .....................................................................................................................................................................................*........... 
+ // eor v2.16b, v31.16b, v2.16b // ......................................................................................................................................................................................*.......... + // bic v31.16b, v30.16b, v4.16b // .......................................................................................................................................................................................*......... + // eor v3.16b, v31.16b, v3.16b // .........................................................................................................................................................................................*....... + // bic v31.16b, v27.16b, v30.16b // ..........................................................................................................................................................................................*...... + // eor v4.16b, v31.16b, v4.16b // ...........................................................................................................................................................................................*..... 
+ // eor v0.16b, v0.16b, v28.16b // ................................................................................................................................................................................................* + + loop_end: + + ble loop + final_scalar_rotate + + // Read outer loop flag: We repeat the above twice + ldr outer, [sp, #STACK_OFFSET_OUTER] // @slothy:reads=STACK_OFFSET_OUTER + cmp outer, #1 + beq done + + // Update outer loop flag + mov outer, #1 + str outer, [sp, #STACK_OFFSET_OUTER] // @slothy:writes=STACK_OFFSET_OUTER + + ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT + store_input_scalar 4,0 // Store first scalar data + load_input_scalar 4,1 // Load second scalar input + + b initial +done: + + ldr input_addr, [sp, #STACK_OFFSET_INPUT] // @slothy:reads=STACK_OFFSET_INPUT + store_input_scalar 4,1 + store_input_vector 2,1 + + restore_vregs + restore_gprs + free_stack + ret diff --git a/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_symbolic.s b/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_symbolic.s index ac78abef..a3278e3f 100644 --- a/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_symbolic.s +++ b/examples/naive/aarch64/keccak_f1600_x4_hybrid_slothy_symbolic.s @@ -24,6 +24,8 @@ */ #include +#define KECCAK_F1600_ROUNDS 24 + // // Author: Hanno Becker // Author: Matthias Kannwischer @@ -180,7 +182,7 @@ round_constants: sAka .req x3 sAke .req x8 sAki .req x13 - sAko .req x18 + sAko .req x28 sAku .req x23 sAma .req x4 sAme .req x9 @@ -193,46 +195,6 @@ round_constants: sAso .req x20 sAsu .req x25 - /* sA_[y,2*x+3*y] = rot(A[x,y]) */ - sAba_ .req x0 - sAbe_ .req x28 - sAbi_ .req x11 - sAbo_ .req x16 - sAbu_ .req x21 - sAga_ .req x3 - sAge_ .req x8 - sAgi_ .req x12 - sAgo_ .req x17 - sAgu_ .req x22 - sAka_ .req x4 - sAke_ .req x9 - sAki_ .req x13 - sAko_ .req x18 - sAku_ .req x23 - sAma_ .req x5 - sAme_ .req x10 - sAmi_ .req x14 - sAmo_ .req x19 - sAmu_ .req x24 - sAsa_ .req x1 - sAse_ 
.req x6 - sAsi_ .req x15 - sAso_ .req x20 - sAsu_ .req x25 - - /* sC[x] = sA[x,0] xor sA[x,1] xor sA[x,2] xor sA[x,3] xor sA[x,4], for x in 0..4 */ - /* sE[x] = sC[x-1] xor rot(C[x+1],1), for x in 0..4 */ - sC0 .req x0 - sE0 .req x29 - sC1 .req x26 - sE1 .req x30 - sC2 .req x27 - sE2 .req x26 - sC3 .req x28 - sE3 .req x27 - sC4 .req x29 - sE4 .req x28 - tmp .req x30 /************************ MACROS ****************************/ @@ -482,15 +444,7 @@ round_constants: eor \out\(), \a\(), X, ror #\r2 .endm -.macro hybrid_round_initial - scalar_round_initial - scalar_round_noninitial - vector_round - vector_round -.endm - .macro scalar_round_initial - eor5 X, sAma, sAsa, sAba, sAga, sAka eor5 X, sAme, sAse, sAbe, sAge, sAke eor5 X, sAmi, sAsi, sAbi, sAgi, sAki @@ -604,9 +558,9 @@ round_constants: xar_m1 vAsu_, vAse, E1, 62 xar_m1 vAme_, vAga, E0, 28 xar_m1 vAbe_, vAge, E1, 20 - ldr sE1, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:reads=STACK_OFFSET_CONST_VECTOR - ld1r {v28.2d}, [sE1], #8 - str sE1, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:writes=STACK_OFFSET_CONST_VECTOR + ldr tmp, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:reads=STACK_OFFSET_CONST_VECTOR + ld1r {v28.2d}, [tmp], #8 + str tmp, [sp, #STACK_OFFSET_CONST_VECTOR] // @slothy:writes=STACK_OFFSET_CONST_VECTOR bcax_m1 vAga, vAga_, vAgi_, vAge_ bcax_m1 vAge, vAge_, vAgo_, vAgi_ bcax_m1 vAgi, vAgi_, vAgu_, vAgo_ @@ -759,8 +713,6 @@ round_constants: ror sAsu, sAsu,#(64-55) .endm -#define KECCAK_F1600_ROUNDS 24 - .global keccak_f1600_x4_hybrid_slothy_symbolic .global _keccak_f1600_x4_hybrid_slothy_symbolic .text @@ -784,15 +736,15 @@ _keccak_f1600_x4_hybrid_slothy_symbolic: load_input_scalar 4,0 // First scalar input initial: - scalar_round_initial - scalar_round_noninitial - vector_round + scalar_round_initial // @slothy:interleaving_class=0 + scalar_round_noninitial // @slothy:interleaving_class=0 + vector_round // @slothy:interleaving_class=1 loop: - scalar_round_noninitial - scalar_round_noninitial - 
vector_round + scalar_round_noninitial // @slothy:interleaving_class=0 + scalar_round_noninitial // @slothy:interleaving_class=0 + vector_round // @slothy:interleaving_class=1 loop_end: - ble loop_0 + ble loop final_scalar_rotate // Read outer loop flag: We repeat the above twice diff --git a/examples/opt/aarch64/keccak_f1600_x4_hybrid_no_symbolic_opt_a55.s b/examples/opt/aarch64/keccak_f1600_x4_hybrid_no_symbolic_opt_a55.s index 9f25df6d..4cd39483 100644 --- a/examples/opt/aarch64/keccak_f1600_x4_hybrid_no_symbolic_opt_a55.s +++ b/examples/opt/aarch64/keccak_f1600_x4_hybrid_no_symbolic_opt_a55.s @@ -2564,7 +2564,7 @@ _keccak_f1600_x4_hybrid_no_symbolic: loop_end: - ble loop_0 + ble loop final_scalar_rotate // Read outer loop flag: We repeat the above twice