From d91daffc9e400037fbe874ef24c82920a71b5838 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Thu, 25 Jul 2024 14:54:14 +0300 Subject: [PATCH 1/3] fixed paths and utf8-lossy=true --- unit/hyperscan/rebar_tests.cpp | 84 ++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 15 deletions(-) diff --git a/unit/hyperscan/rebar_tests.cpp b/unit/hyperscan/rebar_tests.cpp index be23d677d..12e9905b7 100644 --- a/unit/hyperscan/rebar_tests.cpp +++ b/unit/hyperscan/rebar_tests.cpp @@ -43,6 +43,12 @@ using namespace std; +#define xstr(s) to_string_literal(s) +#define to_string_literal(s) #s + +#define SRCDIR_PREFIX xstr(SRCDIR) + + TEST(rebar, leipzig_math_symbols_count) { hs_database_t *db = nullptr; hs_compile_error_t *compile_err = nullptr; @@ -60,8 +66,8 @@ TEST(rebar, leipzig_math_symbols_count) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(scratch != nullptr); - - std::ifstream file("../source/unit/hyperscan/datafiles/leipzig-3200.txt"); + string filename = "unit/hyperscan/datafiles/leipzig-3200.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string @@ -77,14 +83,55 @@ TEST(rebar, leipzig_math_symbols_count) { ASSERT_EQ(HS_SUCCESS, err); } +// Function to replace invalid UTF-8 sequences with the replacement character +std::string utf8_lossy_decode(const std::string &input) { + std::string output; + for (size_t i = 0; i < input.size(); ++i) { + unsigned char c = input[i]; + if (c < 0x80) { + output += c; + } else if (c < 0xC0) { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } else if (c < 0xE0) { + if (i + 1 < input.size() && (input[i + 1] & 0xC0) == 0x80) { + output += c; + output += input[i + 1]; + ++i; + } else { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } + } else if (c < 0xF0) { + if (i + 2 < input.size() && (input[i + 1] & 0xC0) == 0x80 && (input[i + 2] & 0xC0) == 0x80) { + output += c; + output += input[i + 1]; + output += input[i + 2]; + i += 2; + } else { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } + } else { + output += '\xEF'; + output += '\xBF'; + output += '\xBD'; + } + } + return output; +} + TEST(rebar, lh3lh3_reb_uri_or_email_grep) { hs_database_t *db = nullptr; hs_compile_error_t *compile_err = nullptr; CallBackContext c; const char *expr = "([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)"; const unsigned flag = 0; - const unsigned id= 1; - hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); + const unsigned id = 1; + hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -94,14 +141,17 @@ TEST(rebar, lh3lh3_reb_uri_or_email_grep) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(scratch != nullptr); - - std::ifstream file("../source/unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"); + string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string + // Decode the data using UTF-8 lossy decoding + std::string decoded_data = utf8_lossy_decode(data); + c.halt = 0; - err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb, reinterpret_cast(&c)); ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(888987, c.matches.size()); @@ -117,8 +167,8 @@ TEST(rebar, lh3lh3_reb_email_grep) { CallBackContext c; const char *expr = "([^ @]+)@([^ @]+)"; const unsigned flag = 0; - const unsigned id= 1; - hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); + const unsigned id = 1; + hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(db != nullptr); @@ -128,14 +178,17 @@ TEST(rebar, lh3lh3_reb_email_grep) { ASSERT_EQ(HS_SUCCESS, err); ASSERT_TRUE(scratch != nullptr); - - std::ifstream file("../source/unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"); + string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string + // Decode the data using UTF-8 lossy decoding + std::string decoded_data = utf8_lossy_decode(data); + c.halt = 0; - err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb, reinterpret_cast(&c)); ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(232354, c.matches.size()); @@ -164,13 +217,14 @@ TEST(rebar, lh3lh3_reb_date_grep) { ASSERT_TRUE(scratch != nullptr); - std::ifstream file("../source/unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"); + string filename = "unit/hyperscan/datafiles/lh3lh3-reb-howto.txt"; + std::ifstream file((string(SRCDIR_PREFIX) + "/" + filename).c_str()); std::stringstream buffer; buffer << file.rdbuf(); // Read the file into the buffer std::string data = buffer.str(); // Convert the buffer into a std::string - + std::string decoded_data = utf8_lossy_decode(data); c.halt = 0; - err = hs_scan(db, data.c_str(), data.size(), 0, scratch, record_cb, + err = hs_scan(db, decoded_data.c_str(), decoded_data.size(), 0, scratch, record_cb, reinterpret_cast(&c)); ASSERT_EQ(HS_SUCCESS, err); ASSERT_EQ(819, c.matches.size()); From 99b286b165a8b2906b128467a03a615d94baf058 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Thu, 25 Jul 2024 14:54:56 +0300 Subject: [PATCH 2/3] revert to maskz (its the bug) --- src/nfa/vermicelli_simd.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp index 67ac1dac8..6aaa679c1 100644 --- a/src/nfa/vermicelli_simd.cpp +++ b/src/nfa/vermicelli_simd.cpp @@ -124,8 +124,8 @@ static const u8 *vermicelliExecReal(SuperVector const chars, SuperVector c // finish off tail if (d != buf_end) { - SuperVector data = SuperVector::loadu(buf_end - S); - rv = vermicelliBlock(data, chars, casemask, buf_end - S, buf_end - d); + SuperVector data = SuperVector::loadu_maskz(d, buf_end - d); + rv = vermicelliBlock(data, chars, casemask, d, buf_end - d); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } From 624c6a6ca74cff7c3ddeb32c64d55fb2dfd56ef3 Mon Sep 17 00:00:00 2001 From: gtsoul-tech Date: Fri, 26 Jul 2024 11:04:26 +0300 Subject: [PATCH 3/3] cppcheck fix --- unit/hyperscan/rebar_tests.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/unit/hyperscan/rebar_tests.cpp b/unit/hyperscan/rebar_tests.cpp index 12e9905b7..26e720551 100644 --- a/unit/hyperscan/rebar_tests.cpp +++ b/unit/hyperscan/rebar_tests.cpp @@ -55,7 +55,6 @@ TEST(rebar, leipzig_math_symbols_count) { CallBackContext c; const char *expr = "\\p{Sm}"; const unsigned flag = HS_FLAG_UCP | HS_FLAG_UTF8; - const unsigned id= 1; hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); @@ -130,7 +129,6 @@ TEST(rebar, lh3lh3_reb_uri_or_email_grep) { CallBackContext c; const char *expr = "([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)"; const unsigned flag = 0; - const unsigned id = 1; hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); @@ -167,7 +165,6 @@ TEST(rebar, lh3lh3_reb_email_grep) { CallBackContext c; const char *expr = "([^ @]+)@([^ @]+)"; const unsigned flag = 0; - const unsigned id = 1; hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK, nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err); @@ -205,7 +202,6 @@ TEST(rebar, lh3lh3_reb_date_grep) { CallBackContext c; const char *expr = "([0-9][0-9]?)/([0-9][0-9]?)/([0-9][0-9]([0-9][0-9])?)"; const unsigned flag = 0; - const unsigned id= 1; hs_error_t err = hs_compile(expr, flag, HS_MODE_BLOCK,nullptr, &db, &compile_err); ASSERT_EQ(HS_SUCCESS, err);