From c91d34c99f2d319f8c082d5a9a7d821db09d56d3 Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Sun, 27 Jan 2019 15:13:35 +0000
Subject: [PATCH] Optimizations for Armv8-A

These changes apply only to the AArch64 execution state.
---
 picohttpparser.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 88 insertions(+), 2 deletions(-)

diff --git a/picohttpparser.c b/picohttpparser.c
index 74ccc3e..ed34f56 100644
--- a/picohttpparser.c
+++ b/picohttpparser.c
@@ -34,6 +34,9 @@
 #include <x86intrin.h>
 #endif
 #endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 #include "picohttpparser.h"

 #if __GNUC__ >= 3
@@ -71,9 +74,8 @@
 #define ADVANCE_TOKEN(tok, toklen) \
     do { \
         const char *tok_start = buf; \
-        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
         int found2; \
-        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
+        buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
         if (!found2) { \
             CHECK_EOF(); \
         } \
@@ -131,6 +133,46 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
     return buf;
 }

+static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
+{
+#if defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
+    *found = 0;
+
+    for (size_t i = (buf_end - buf) / sizeof(uint8x16_t); i; i--) {
+        // This mask makes it possible to pack the comparison result into half a vector,
+        // which has the same size as uint64_t.
+        const uint8x16_t mask = vreinterpretq_u8_u16(vmovq_n_u16(0x8008));
+        uint8x16_t v = vld1q_u8((const uint8_t *) buf);
+
+        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
+        // After masking, a byte in the result does not have the same bits set as any of its neighbours.
+        v = vandq_u8(v, mask);
+        // Pack the comparison result into 64 bits.
+        v = vpaddq_u8(v, v);
+
+        uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);
+
+        if (offset) {
+            *found = 1;
+            __asm__ ("rbit %x0, %x0" : "+r" (offset));
+            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                          "Need the number of leading 0-bits in uint64_t.");
+            // offset uses 4 bits per byte of input.
+            buf += __builtin_clzll(offset) / 4;
+            break;
+        }
+
+        buf += sizeof(v);
+    }
+
+    return buf;
+#else
+    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
+
+    return findchar_fast(buf, buf_end, ranges2, 4, found);
+#endif
+}
+
 static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
 {
     const char *token_start = buf;
@@ -143,6 +185,50 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
     buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
+#elif defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
+    for (size_t i = (buf_end - buf) / (2 * sizeof(uint8x16_t)); i; i--) {
+        const uint8x16_t space = vmovq_n_u8('\040');
+        const uint8x16_t threshold = vmovq_n_u8(0137u);
+        const uint8x16_t v1 = vld1q_u8((const uint8_t *) buf);
+        const uint8x16_t v2 = vld1q_u8((const uint8_t *) buf + sizeof(v1));
+        uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
+        uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
+
+        v3 = vorrq_u8(v3, v4);
+        // Pack the comparison result into half a vector, i.e. 64 bits; the result will still be non-zero
+        // even if any adjacent bytes are the same (either 0 or 0xFF).
+        v3 = vpaddq_u8(v3, v3);
+
+        if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
+            const uint8x16_t del = vmovq_n_u8('\177');
+            const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
+            const uint8x16_t tab = vmovq_n_u8('\011');
+
+            v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
+            v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
+            v3 = vorrq_u8(v3, vceqq_u8(v1, del));
+            v4 = vorrq_u8(v4, vceqq_u8(v2, del));
+            // After masking, four consecutive bytes in the results do not have the same bits set.
+            v3 = vandq_u8(v3, mask);
+            v4 = vandq_u8(v4, mask);
+            // Pack the comparison results into 128, and then 64 bits.
+            v3 = vpaddq_u8(v3, v4);
+            v3 = vpaddq_u8(v3, v3);
+
+            uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
+
+            if (offset) {
+                __asm__ ("rbit %x0, %x0" : "+r" (offset));
+                static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                              "Need the number of leading 0-bits in uint64_t.");
+                // offset uses 2 bits per byte of input.
+                buf += __builtin_clzll(offset) / 2;
+                goto FOUND_CTL;
+            }
+        }
+
+        buf += sizeof(v1) + sizeof(v2);
+    }
 #else
     /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
     while (likely(buf_end - buf >= 8)) {