From af782f38fd2cd3c40ab340b679f2696daf42eb39 Mon Sep 17 00:00:00 2001
From: Anton Kirilov
Date: Sun, 27 Jan 2019 15:13:35 +0000
Subject: [PATCH] Optimizations for Armv8-A

These changes apply only to the AArch64 execution state.
---
 picohttpparser.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 136 insertions(+), 2 deletions(-)

diff --git a/picohttpparser.c b/picohttpparser.c
index 5e5783a..8e322d7 100644
--- a/picohttpparser.c
+++ b/picohttpparser.c
@@ -34,6 +34,12 @@
 #include <x86intrin.h>
 #endif
 #endif
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 #include "picohttpparser.h"
 
 #if __GNUC__ >= 3
@@ -71,9 +77,8 @@
 #define ADVANCE_TOKEN(tok, toklen)                                                       \
     do {                                                                                 \
         const char *tok_start = buf;                                                     \
-        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";                  \
         int found2;                                                                      \
-        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2);                          \
+        buf = findchar_nonprintable_fast(buf, buf_end, &found2);                         \
         if (!found2) {                                                                   \
             CHECK_EOF();                                                                 \
         }                                                                                \
@@ -131,6 +136,65 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
     return buf;
 }
 
+static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
+{
+#ifdef __ARM_FEATURE_SVE
+    *found = 0;
+
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
+        const svbool_t c = svorr_z(pg, svcmplt(pg, v, '\041'), svcmpeq(pg, v, '\177'));
+
+        if (svptest_any(pg, c)) {
+            *found = 1;
+            buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+            break;
+        }
+    }
+
+    return buf;
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    *found = 0;
+
+    const size_t block_size = sizeof(uint8x16_t) - 1;
+    const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
+
+    for (; buf < end; buf += sizeof(uint8x16_t)) {
+        uint8x16_t v = vld1q_u8((const uint8_t *) buf);
+
+        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
+
+        // Pack the comparison result into 64 bits.
+        const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
+        uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0);
+
+        if (offset) {
+            *found = 1;
+            __asm__ ("rbit %x0, %x0" : "+r" (offset));
+            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                          "Need the number of leading 0-bits in uint64_t.");
+            // offset uses 4 bits per byte of input.
+            buf += __builtin_clzll(offset) / 4;
+            break;
+        }
+    }
+
+    return buf;
+#else
+    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
+
+    return findchar_fast(buf, buf_end, ranges2, 4, found);
+#endif
+}
+
 static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
 {
     const char *token_start = buf;
@@ -143,6 +207,76 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
     buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
+#elif defined(__ARM_FEATURE_SVE)
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
+        const uint8_t space = '\040';
+
+        if (svptest_any(pg, svcmpge(pg, svsub_x(pg, v, space), 0137u))) {
+            svbool_t c = svcmpne(svcmplt(pg, v, space), v, '\011');
+
+            c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
+
+            if (svptest_any(pg, c)) {
+                buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+                goto FOUND_CTL;
+            }
+        }
+    }
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
+    const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
+
+    for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
+        const uint8x16_t space = vmovq_n_u8('\040');
+        const uint8x16_t threshold = vmovq_n_u8(0137u);
+        const uint8x16_t v1 = vld1q_u8((const uint8_t *) buf);
+        const uint8x16_t v2 = vld1q_u8((const uint8_t *) buf + sizeof(v1));
+        uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
+        uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
+
+        v3 = vorrq_u8(v3, v4);
+        // Pack the comparison result into half a vector, i.e. 64 bits.
+        v3 = vpmaxq_u8(v3, v3);
+
+        if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
+            const uint8x16_t del = vmovq_n_u8('\177');
+            // This mask makes it possible to pack the comparison results into half a vector,
+            // which has the same size as uint64_t.
+            const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
+            const uint8x16_t tab = vmovq_n_u8('\011');
+
+            v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
+            v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
+            v3 = vorrq_u8(v3, vceqq_u8(v1, del));
+            v4 = vorrq_u8(v4, vceqq_u8(v2, del));
+            // After masking, four consecutive bytes in the results do not have the same bits set.
+            v3 = vandq_u8(v3, mask);
+            v4 = vandq_u8(v4, mask);
+            // Pack the comparison results into 128, and then 64 bits.
+            v3 = vpaddq_u8(v3, v4);
+            v3 = vpaddq_u8(v3, v3);
+
+            uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
+
+            if (offset) {
+                __asm__ ("rbit %x0, %x0" : "+r" (offset));
+                static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                              "Need the number of leading 0-bits in uint64_t.");
+                // offset uses 2 bits per byte of input.
+                buf += __builtin_clzll(offset) / 2;
+                goto FOUND_CTL;
+            }
+        }
+    }
 #else
     /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
     while (likely(buf_end - buf >= 8)) {
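
A note on the NEON bitmask trick in findchar_nonprintable_fast: AArch64 has no direct equivalent of SSE's movemask, so the patch narrows each 16-bit pair of comparison-result lanes with vshrn_n_u16(..., 4), leaving one nibble per input byte in a 64-bit value; the position of the first set bit then identifies the first matching byte. The following is a minimal, self-contained sketch of that technique, assuming an AArch64 toolchain (e.g. cc -O2 on an aarch64 host). The function name find_nonprintable16 and the test input are hypothetical, and __builtin_ctzll stands in for the patch's rbit + clz inline assembly, which computes the same count of trailing zeros.

/* Sketch only: mirrors the bit manipulation of the patch, not its API. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Returns the index of the first byte that is < '\041' or == '\177'
 * within a 16-byte block, or 16 if every byte is printable. */
static size_t find_nonprintable16(const uint8_t *p)
{
    uint8x16_t v = vld1q_u8(p);

    /* Matching lanes become 0xFF, all others 0x00. */
    v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));

    /* Shift each 16-bit lane right by 4 and narrow to 8 bits: the 128-bit
     * lane mask becomes a 64-bit value with one nibble per input byte,
     * first byte in the least significant nibble. */
    const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
    const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(rv), 0);

    if (mask == 0)
        return 16;

    /* The patch reverses the bits with rbit and counts leading zeros;
     * counting trailing zeros directly is equivalent. */
    return (size_t)(__builtin_ctzll(mask) / 4);
}

int main(void)
{
    uint8_t block[16];

    memset(block, 'a', sizeof(block));
    block[11] = '\t'; /* '\011' < '\041', so index 11 should be reported. */

    printf("first match at index %zu\n", find_nonprintable16(block));
    return 0;
}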
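
The two-block loop in get_token_to_eol cannot use the nibble trick directly: 32 bytes at 4 bits each would overflow 64 bits. Instead, each comparison result is ANDed with the repeating byte pattern 0x01, 0x04, 0x10, 0x40 (the 0x40100401 constant), so four adjacent bytes carry disjoint bits and pairwise additions never produce a carry; two vpaddq_u8 folds then reduce 32 bytes to 8, i.e. 2 bits per input byte. Below is a sketch of just that packing step, under the same assumptions as above (hypothetical function name find_byte32, a single-character predicate standing in for the patch's range checks, and ctz instead of rbit + clz).

/* Sketch only: demonstrates the 0x40100401 packing, not the patch's API. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Returns the index of the first occurrence of c in a 32-byte block,
 * or 32 if it does not appear. */
static size_t find_byte32(const uint8_t *p, uint8_t c)
{
    /* Bytes 0x01, 0x04, 0x10, 0x40 repeated: adjacent bytes keep
     * disjoint bits, so the pairwise sums below cannot carry. */
    const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
    uint8x16_t v1 = vceqq_u8(vld1q_u8(p), vmovq_n_u8(c));
    uint8x16_t v2 = vceqq_u8(vld1q_u8(p + 16), vmovq_n_u8(c));

    v1 = vandq_u8(v1, mask);
    v2 = vandq_u8(v2, mask);

    /* Two pairwise additions fold 32 bytes into 8: the low 64 bits now
     * hold 2 bits per input byte, with input byte k mapped to bit 2k. */
    v1 = vpaddq_u8(v1, v2);
    v1 = vpaddq_u8(v1, v1);

    const uint64_t bits = vgetq_lane_u64(vreinterpretq_u64_u8(v1), 0);

    return bits ? (size_t)(__builtin_ctzll(bits) / 2) : 32;
}

int main(void)
{
    uint8_t block[32];

    memset(block, 'x', sizeof(block));
    block[19] = '\r';

    printf("first CR at index %zu\n", find_byte32(block, '\r'));
    return 0;
}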