From 11385db258b2b42d259babc3eb49897b6ca49d73 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Thu, 27 Jun 2024 12:42:32 +0100 Subject: [PATCH] sct: fix compilation of kyber* (no sct yet); there will be a separate PR; --- src/common/keccak/common/fips202_DIRTY.jinc | 6 +- .../amd64/avx2/keccak1600_nomsf.jinc | 272 ++++++++++++++++++ .../amd64/avx2/keccakf1600_nomsf.jinc | 202 +++++++++++++ 3 files changed, 476 insertions(+), 4 deletions(-) create mode 100644 src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc create mode 100644 src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc diff --git a/src/common/keccak/common/fips202_DIRTY.jinc b/src/common/keccak/common/fips202_DIRTY.jinc index 92698c60..82f6c335 100644 --- a/src/common/keccak/common/fips202_DIRTY.jinc +++ b/src/common/keccak/common/fips202_DIRTY.jinc @@ -1,7 +1,5 @@ -param int KECCAK_ROUNDS=24; - -from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc" +from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc" +from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc" require "fips202_params.jinc" #[returnaddress="stack"] diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc new file mode 100644 index 00000000..0f6ace84 --- /dev/null +++ b/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc @@ -0,0 +1,272 @@ +param int KECCAK_ROUNDS=24; + +require "keccakf1600_nomsf.jinc" + +inline fn __keccak_init_avx2() -> reg u256[7] +{ + inline int i; + reg u256[7] state; + + for i=0 to 7 + { state[i] = #set0_256(); } + + return state; +} + + +inline fn __init_s_state_avx2() -> stack u64[28] +{ + inline int i; + stack u64[28] s_state; + reg u256 zero; + + zero = #set0_256(); + for i=0 to 7 + { s_state[u256 i] = zero; } + + return s_state; +} + + +inline fn __add_full_block_avx2( + reg u256[7] state, + stack u64[28] s_state, + reg ptr u64[25] a_jagged_p, + reg u64 in inlen, + reg u64 rate +) -> reg u256[7], stack u64[28], reg u64, reg u64 +{ + + inline int i; + reg u64 j l t rate8; + + rate8 = rate; + rate8 >>= 3; + j = 0; + while ( j < rate8 ) + { + t = [in + 8*j]; + l = a_jagged_p[(int) j]; + s_state[(int) l] = t; + j += 1; + } + + //TODO: check & change to #VPBROADCAST_4u64 + t = s_state[0]; + s_state[1] = t; + s_state[2] = t; + s_state[3] = t; + + for i = 0 to 7 + { state[i] ^= s_state[u256 i]; } + + in += rate; + inlen -= rate; + + return state, s_state, in, inlen; +} + + +// TODO: refactor when this feature is available: https://github.com/haslab/libjbn/wiki/Feature-request-%231#procedural-parameters +inline fn __add_final_block_avx2( + reg u256[7] state, + stack u64[28] s_state, + reg ptr u64[25] a_jagged_p, + reg u64 in inlen, + reg u8 trail_byte, + reg u64 rate +) -> reg u256[7] +{ + inline int i; + reg u64 j l t inlen8; + reg u8 c; + + s_state = __init_s_state_avx2(); + + inlen8 = inlen; + inlen8 >>= 3; + j = 0; + while ( j < inlen8 ) + { + t = [in + 8*j]; + l = a_jagged_p[(int) j]; + s_state[(int) l] = t; + j += 1; + } + l = a_jagged_p[(int) j]; + l <<= 3; + j <<= 3; + + while ( j < inlen ) + { + c = (u8)[in + j]; + s_state[u8 (int) l] = c; + j += 1; + l += 1; + } + + s_state[u8 (int) l] = trail_byte; + + // j = (rate-1) >> 3; + j = rate; j -= 1; j >>= 3; + l = a_jagged_p[(int) j]; + l <<= 3; + // l += ((rate-1) & 0x7) + j = rate; j -= 1; j &= 0x7; + l += j; + + s_state[u8 (int) l] ^= 0x80; + + t = s_state[0]; + s_state[1] = t; + s_state[2] = t; + s_state[3] = t; + + for i = 0 to 7 + { state[i] ^= s_state[u256 i]; } + + return state; +} + + +// obs: @pre: len <= rate_in_bytes +inline fn __xtr_full_block_avx2( + reg u256[7] state, + reg ptr u64[25] a_jagged_p, + reg u64 out, + reg u64 len +) -> reg u64 +{ + inline int i; + stack u64[28] s_state; + reg u64 j l t len8; + + for i = 0 to 7 + { s_state[u256 i] = state[i]; } + + len8 = len; + len8 >>= 3; + j = 0; + while ( j < len8 ) + { + l = a_jagged_p[(int) j]; + t = s_state[(int) l]; + [out + 8*j] = t; + j += 1; + } + + out += len; + + return out; +} + + +// obs: @pre: len <= rate_in_bytes +inline fn __xtr_bytes_avx2( + reg u256[7] state, + reg ptr u64[25] a_jagged_p, + reg u64 out, + reg u64 len +) -> reg u64 +{ + inline int i; + stack u64[28] s_state; + reg u64 j l t len8; + reg u8 c; + + for i = 0 to 7 + { s_state[u256 i] = state[i]; } + + len8 = len; + len8 >>= 3; + j = 0; + while ( j < len8 ) + { l = a_jagged_p[(int) j]; + t = s_state[(int) l]; + [out + 8*j] = t; + j += 1; + } + l = a_jagged_p[(int)j]; + j <<= 3; + l <<= 3; + + while ( j < len ) + { + c = s_state[u8 (int) l]; + (u8)[out + j] = c; + j += 1; + l += 1; + } + + out += len; + + return out; +} + + +inline fn __absorb_avx2( + reg u256[7] state, + reg u64 in inlen, + reg u8 trail_byte, + reg u64 rate +) -> reg u256[7] +{ + stack u64[28] s_state; + reg ptr u64[25] a_jagged_p; + + a_jagged_p = KECCAK_A_JAGGED; + s_state = __init_s_state_avx2(); + + // intermediate blocks + while ( inlen >= rate ) + { + state, s_state, in, inlen = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate); + state = __keccakf1600_avx2(state); + } + + // final block + state = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate); + + return state; +} + + +inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate) +{ + reg ptr u64[25] a_jagged_p; + + a_jagged_p = KECCAK_A_JAGGED; + + // intermediate blocks + while ( outlen > rate ) + { + state = __keccakf1600_avx2(state); + out = __xtr_full_block_avx2(state, a_jagged_p, out, rate); + outlen -= rate; + } + + state = __keccakf1600_avx2(state); + out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen); +} + + +inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) +{ + reg u256[7] state; + + state = __keccak_init_avx2(); + + // absorb + state = __absorb_avx2(state, in, inlen, trail_byte, rate); + + // squeeze + __squeeze_avx2(state, out, outlen, rate); +} + + +fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) +{ + __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); +} + + diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc new file mode 100644 index 00000000..6ca9dda6 --- /dev/null +++ b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc @@ -0,0 +1,202 @@ + +u256[24] KECCAK_IOTAS = +{ (4u64)[0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001] + ,(4u64)[0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082] + ,(4u64)[0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a] + ,(4u64)[0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000] + ,(4u64)[0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b] + ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001] + ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081] + ,(4u64)[0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009] + ,(4u64)[0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a] + ,(4u64)[0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088] + ,(4u64)[0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009] + ,(4u64)[0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a] + ,(4u64)[0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b] + ,(4u64)[0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b] + ,(4u64)[0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089] + ,(4u64)[0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003] + ,(4u64)[0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002] + ,(4u64)[0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080] + ,(4u64)[0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a] + ,(4u64)[0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a] + ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081] + ,(4u64)[0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080] + ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001] + ,(4u64)[0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008] +}; + + +u256[6] KECCAK_RHOTATES_LEFT = +{ + (4u64)[41, 36, 18, 3], + (4u64)[27, 28, 62, 1], + (4u64)[39, 56, 6, 45], + (4u64)[ 8, 55, 61, 10], + (4u64)[20, 25, 15, 2], + (4u64)[14, 21, 43, 44] +}; + + +u256[6] KECCAK_RHOTATES_RIGHT = +{ + (4u64)[64-41, 64-36, 64-18, 64- 3], + (4u64)[64-27, 64-28, 64-62, 64- 1], + (4u64)[64-39, 64-56, 64- 6, 64-45], + (4u64)[64- 8, 64-55, 64-61, 64-10], + (4u64)[64-20, 64-25, 64-15, 64- 2], + (4u64)[64-14, 64-21, 64-43, 64-44] +}; + + +u64[25] KECCAK_A_JAGGED = +{ + 0, 4, 5, 6, 7, + 10, 24, 13, 18, 23, + 8, 16, 25, 22, 15, + 11, 12, 21, 26, 19, + 9, 20, 17, 14, 27 +}; + + +inline fn __keccakf1600_avx2(reg u256[7] state) -> reg u256[7] +{ + reg u256[9] t; + reg u256 c00 c14 d00 d14; + + reg bool zf; + reg u64 r iotas_o; + + reg ptr u256[24] iotas_p; + reg ptr u256[6] rhotates_left_p; + reg ptr u256[6] rhotates_right_p; + + iotas_p = KECCAK_IOTAS; + iotas_o = 0; + rhotates_left_p = KECCAK_RHOTATES_LEFT; + rhotates_right_p = KECCAK_RHOTATES_RIGHT; + + r = KECCAK_ROUNDS; + while + { + //######################################## Theta + c00 = #VPSHUFD_256(state[2], (4u2)[1,0,3,2]); + c14 = state[5] ^ state[3]; + t[2] = state[4] ^ state[6]; + c14 = c14 ^ state[1]; + c14 = c14 ^ t[2]; + t[4] = #VPERMQ(c14, (4u2)[2,1,0,3]); + c00 = c00 ^ state[2]; + t[0] = #VPERMQ(c00, (4u2)[1,0,3,2]); + t[1] = c14 >>4u64 63; + t[2] = c14 +4u64 c14; + t[1] = t[1] | t[2]; + d14 = #VPERMQ(t[1], (4u2)[0,3,2,1]); + d00 = t[1] ^ t[4]; + d00 = #VPERMQ(d00, (4u2)[0,0,0,0]); + c00 = c00 ^ state[0]; + c00 = c00 ^ t[0]; + t[0] = c00 >>4u64 63; + t[1] = c00 +4u64 c00; + t[1] = t[1] | t[0]; + state[2] = state[2] ^ d00; + state[0] = state[0] ^ d00; + d14 = #VPBLEND_8u32(d14, t[1], (8u1)[1,1,0,0,0,0,0,0]); + t[4] = #VPBLEND_8u32(t[4], c00, (8u1)[0,0,0,0,0,0,1,1]); + d14 = d14 ^ t[4]; + + //######################################## Rho + Pi + pre-Chi shuffle + t[3] = #VPSLLV_4u64(state[2], rhotates_left_p[0] ); + state[2] = #VPSRLV_4u64(state[2], rhotates_right_p[0] ); + state[2] = state[2] | t[3]; + state[3] = state[3] ^ d14; + t[4] = #VPSLLV_4u64(state[3], rhotates_left_p[2] ); + state[3] = #VPSRLV_4u64(state[3], rhotates_right_p[2] ); + state[3] = state[3] | t[4]; + state[4] = state[4] ^ d14; + t[5] = #VPSLLV_4u64(state[4], rhotates_left_p[3] ); + state[4] = #VPSRLV_4u64(state[4], rhotates_right_p[3] ); + state[4] = state[4] | t[5]; + state[5] = state[5] ^ d14; + t[6] = #VPSLLV_4u64(state[5], rhotates_left_p[4] ); + state[5] = #VPSRLV_4u64(state[5], rhotates_right_p[4] ); + state[5] = state[5] | t[6]; + state[6] = state[6] ^ d14; + t[3] = #VPERMQ(state[2], (4u2)[2,0,3,1]); + t[4] = #VPERMQ(state[3], (4u2)[2,0,3,1]); + t[7] = #VPSLLV_4u64(state[6], rhotates_left_p[5] ); + t[1] = #VPSRLV_4u64(state[6], rhotates_right_p[5] ); + t[1] = t[1] | t[7]; + state[1] = state[1] ^ d14; + t[5] = #VPERMQ(state[4], (4u2)[0,1,2,3]); + t[6] = #VPERMQ(state[5], (4u2)[1,3,0,2]); + t[8] = #VPSLLV_4u64(state[1], rhotates_left_p[1] ); + t[2] = #VPSRLV_4u64(state[1], rhotates_right_p[1] ); + t[2] = t[2] | t[8]; + + //######################################## Chi + t[7] = #VPSRLDQ_256(t[1], 8); + t[0] = !t[1] & t[7]; + state[3] = #VPBLEND_8u32(t[2], t[6], (8u1)[0,0,0,0,1,1,0,0]); + t[8] = #VPBLEND_8u32(t[4], t[2], (8u1)[0,0,0,0,1,1,0,0]); + state[5] = #VPBLEND_8u32(t[3], t[4], (8u1)[0,0,0,0,1,1,0,0]); + t[7] = #VPBLEND_8u32(t[2], t[3], (8u1)[0,0,0,0,1,1,0,0]); + state[3] = #VPBLEND_8u32(state[3], t[4], (8u1)[0,0,1,1,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[5], (8u1)[0,0,1,1,0,0,0,0]); + state[5] = #VPBLEND_8u32(state[5], t[2], (8u1)[0,0,1,1,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[6], (8u1)[0,0,1,1,0,0,0,0]); + state[3] = #VPBLEND_8u32(state[3], t[5], (8u1)[1,1,0,0,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[6], (8u1)[1,1,0,0,0,0,0,0]); + state[5] = #VPBLEND_8u32(state[5], t[6], (8u1)[1,1,0,0,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[4], (8u1)[1,1,0,0,0,0,0,0]); + state[3] = !state[3] & t[8]; + state[5] = !state[5] & t[7]; + state[6] = #VPBLEND_8u32(t[5], t[2], (8u1)[0,0,0,0,1,1,0,0]); + t[8] = #VPBLEND_8u32(t[3], t[5], (8u1)[0,0,0,0,1,1,0,0]); + state[3] = state[3] ^ t[3]; + state[6] = #VPBLEND_8u32(state[6], t[3], (8u1)[0,0,1,1,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[4], (8u1)[0,0,1,1,0,0,0,0]); + state[5] = state[5] ^ t[5]; + state[6] = #VPBLEND_8u32(state[6], t[4], (8u1)[1,1,0,0,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[2], (8u1)[1,1,0,0,0,0,0,0]); + state[6] = !state[6] & t[8]; + state[6] = state[6] ^ t[6]; + state[4] = #VPERMQ(t[1], (4u2)[0,1,3,2]); + t[8] = #VPBLEND_8u32(state[4], state[0], (8u1)[0,0,1,1,0,0,0,0]); + state[1] = #VPERMQ(t[1], (4u2)[0,3,2,1]); + state[1] = #VPBLEND_8u32(state[1], state[0], (8u1)[1,1,0,0,0,0,0,0]); + state[1] = !state[1] & t[8]; + state[2] = #VPBLEND_8u32(t[4], t[5], (8u1)[0,0,0,0,1,1,0,0]); + t[7] = #VPBLEND_8u32(t[6], t[4], (8u1)[0,0,0,0,1,1,0,0]); + state[2] = #VPBLEND_8u32(state[2], t[6], (8u1)[0,0,1,1,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[0,0,1,1,0,0,0,0]); + state[2] = #VPBLEND_8u32(state[2], t[3], (8u1)[1,1,0,0,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[5], (8u1)[1,1,0,0,0,0,0,0]); + state[2] = !state[2] & t[7]; + state[2] = state[2] ^ t[2]; + t[0] = #VPERMQ(t[0], (4u2)[0,0,0,0]); + state[3] = #VPERMQ(state[3], (4u2)[0,1,2,3]); + state[5] = #VPERMQ(state[5], (4u2)[2,0,3,1]); + state[6] = #VPERMQ(state[6], (4u2)[1,3,0,2]); + state[4] = #VPBLEND_8u32(t[6], t[3], (8u1)[0,0,0,0,1,1,0,0]); + t[7] = #VPBLEND_8u32(t[5], t[6], (8u1)[0,0,0,0,1,1,0,0]); + state[4] = #VPBLEND_8u32(state[4], t[5], (8u1)[0,0,1,1,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[2], (8u1)[0,0,1,1,0,0,0,0]); + state[4] = #VPBLEND_8u32(state[4], t[2], (8u1)[1,1,0,0,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[1,1,0,0,0,0,0,0]); + state[4] = !state[4] & t[7]; + state[0] = state[0] ^ t[0]; + state[1] = state[1] ^ t[1]; + state[4] = state[4] ^ t[4]; + + //######################################## Iota + state[0] = state[0] ^ iotas_p.[(int) iotas_o]; + iotas_o += 32; + + _,_,_,zf,r = #DEC_64(r); + }(!zf) + + return state; +} + +