From 0daceeb0757a5058a9abb005bae459c7a04bdd73 Mon Sep 17 00:00:00 2001
From: masklinn
Date: Sun, 2 Jun 2024 20:23:50 +0200
Subject: [PATCH] Initial implementation

Notes:

- uap is licensed under Apache 2.0 because that's the normal license
  for the project
- regex-filtered is licensed under BSD 3-clauses because it's largely
  a translation (with changes) of re2's FilteredRE2 and IANAL but it
  seems fairer (and safer) to match
---
 .github/workflows/rust.yml      |  28 ++
 .gitignore                      |   2 +
 .gitmodules                     |   3 +
 Cargo.toml                      |   3 +
 README.md                       | 141 ++++++++
 regex-filtered/Cargo.toml       |  23 ++
 regex-filtered/LICENSE          |  28 ++
 regex-filtered/README.md        |  97 +++++
 regex-filtered/benches/regex.rs |  34 ++
 regex-filtered/src/lib.rs       | 442 +++++++++++++++++++++++
 regex-filtered/src/mapper.rs    | 429 ++++++++++++++++++++++
 regex-filtered/src/model.rs     | 516 +++++++++++++++++++++++++++
 ua-parser/Cargo.toml            |  19 +
 ua-parser/LICENSE               | 201 +++++++++++
 ua-parser/examples/bench.rs     |  46 +++
 ua-parser/src/lib.rs            | 609 ++++++++++++++++++++++++++++++++
 ua-parser/src/resolvers.rs      | 171 +++++++++
 ua-parser/tests/integration.rs  | 391 ++++++++++++++++++++
 ua-parser/uap-core              |   1 +
 19 files changed, 3184 insertions(+)
 create mode 100644 .github/workflows/rust.yml
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 Cargo.toml
 create mode 100644 README.md
 create mode 100644 regex-filtered/Cargo.toml
 create mode 100644 regex-filtered/LICENSE
 create mode 100644 regex-filtered/README.md
 create mode 100644 regex-filtered/benches/regex.rs
 create mode 100644 regex-filtered/src/lib.rs
 create mode 100644 regex-filtered/src/mapper.rs
 create mode 100644 regex-filtered/src/model.rs
 create mode 100644 ua-parser/Cargo.toml
 create mode 100644 ua-parser/LICENSE
 create mode 100644 ua-parser/examples/bench.rs
 create mode 100644 ua-parser/src/lib.rs
 create mode 100644 ua-parser/src/resolvers.rs
 create mode 100644 ua-parser/tests/integration.rs
 create mode 160000 ua-parser/uap-core

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
new file mode 100644
index 0000000..537b525
--- /dev/null
+++ b/.github/workflows/rust.yml
@@ -0,0 +1,28 @@
name: Rust

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
      with:
        submodules: true
    - name: Build
      run: cargo build --verbose
    - name: Format
      run: cargo fmt --check
    - name: Clippy
      run: cargo clippy
    - name: Run tests
      run: cargo test -r --verbose
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
/target
Cargo.lock
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..542987b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
[submodule "ua-parser/uap-core"]
	path = ua-parser/uap-core
	url = https://github.com/ua-parser/uap-core
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..63463c3
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,3 @@
[workspace]
members = ["regex-filtered", "ua-parser"]
resolver = "2"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..293a83e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,141 @@
# User Agent Parser

This module implements the [browserscope / uap
standard](https://github.com/ua-parser/uap-core) for Rust, allowing
the extraction of various metadata from user agents.
The browserscope standard is data-oriented, with [`regexes.yaml`]
specifying the matching and extraction from user-agent strings. This
library implements the matching protocols and provides various types
to make loading the dataset easier, however it does *not* provide the
data itself, to avoid pulling in serialization dependencies and to
avoid constraining how the data is loaded.

## Dataset loading

The crate does not provide any sort of precompiled data file, or
dedicated loader, however [`Regexes`] implements
[`serde::Deserialize`] and can load a [`regexes.yaml`] file or any
format-preserving conversion thereof (e.g. loading from JSON or CBOR
might be preferred if the application already depends on one of
those):

```no_run
# let ua_str = "";
let f = std::fs::File::open("regexes.yaml")?;
let regexes: ua_parser::Regexes = serde_yaml::from_reader(f)?;
let extractor = ua_parser::Extractor::try_from(regexes)?;

# Ok::<(), Box<dyn std::error::Error>>(())
```

All the data-description structures are also Plain Old Data, so they
can be embedded in the application directly, e.g. via a build script:

``` rust
let parsers = vec![
    ua_parser::user_agent::Parser {
        regex: "foo".into(),
        family_replacement: Some("bar".into()),
        ..Default::default()
    }
];
```
## Extraction

The crate provides the ability to either extract individual
information sets (user agent a.k.a. browser, OS, and device) or
extract all three in a single call.

The three infosets are independent and non-overlapping, so while the
full extractor may be convenient, a complete extraction is
unnecessary overhead if only one infoset is needed, and the
extractors themselves are somewhat costly to create and take up
memory.

### Complete Extractor

The complete extractor is simply converted from the [`Regexes`]
structure. The resulting [`Extractor`] embeds all three module-level
extractors as attributes, and [`Extractor::extract`] returns a
3-tuple of `ValueRef`s.


### Individual Extractors

The individual extractors are in the [`user_agent`], [`os`], and
[`device`] modules, all three of which follow the exact same model:

- a `Parser` struct which specifies individual parser configurations,
  used as inputs to the `Builder`
- a `Builder`, into which the relevant parsers can be `push`-ed
- an `Extractor` created from the `Builder`, from which the user can
  `extract` a `ValueRef`
- the `ValueRef` result of data extraction, which may borrow from (and
  is thus lifetime-bound to) the `Parser` substitution data and the
  user agent string it was extracted from
- for convenience, an owned `Value` variant of the `ValueRef`

``` rust
use ua_parser::os::{Builder, Parser, ValueRef};

let e = Builder::new()
    .push(Parser {
        regex: r"(Android)[ \-/](\d+)(?:\.(\d+)|)(?:[.\-]([a-z0-9]+)|)".into(),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Donut".into(),
        os_v1_replacement: Some("1".into()),
        os_v2_replacement: Some("2".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Eclair".into(),
        os_v1_replacement: Some("2".into()),
        os_v2_replacement: Some("1".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Froyo".into(),
        os_v1_replacement: Some("2".into()),
        os_v2_replacement: Some("2".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Gingerbread".into(),
        os_v1_replacement: Some("2".into()),
        os_v2_replacement: Some("3".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Honeycomb".into(),
        os_v1_replacement: Some("3".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) (\d+);".into(),
        ..Default::default()
    })?
    .build()?;

assert_eq!(
    e.extract("Android Donut"),
    Some(ValueRef {
        os: "Android".into(),
        major: Some("1".into()),
        minor: Some("2".into()),
        ..Default::default()
    }),
);
assert_eq!(
    e.extract("Android 15"),
    Some(ValueRef { os: "Android".into(), major: Some("15".into()), ..Default::default()}),
);
assert_eq!(
    e.extract("ZuneWP7"),
    None,
);
# Ok::<(), Box<dyn std::error::Error>>(())
```

[`regexes.yaml`]: https://github.com/ua-parser/uap-core/blob/master/regexes.yaml
diff --git a/regex-filtered/Cargo.toml b/regex-filtered/Cargo.toml
new file mode 100644
index 0000000..a0f9490
--- /dev/null
+++ b/regex-filtered/Cargo.toml
@@ -0,0 +1,23 @@
[package]
name = "regex-filtered"
version = "0.1.0"
edition = "2021"
description = "Efficiently check an input against a large number of patterns"
keywords = ["regex", "filter", "FilteredRE2", "multiple", "prefilter"]
license = "BSD-3-Clause"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
aho-corasick = "1.1.3"
indexmap = "2.2.6"
itertools = "0.13.0"
regex = "1.10.4"
regex-syntax = "0.8.3"

[dev-dependencies]
criterion = "0.5.1"

[[bench]]
name = "regex"
harness = false
diff --git a/regex-filtered/LICENSE b/regex-filtered/LICENSE
new file mode 100644
index 0000000..9b346f1
--- /dev/null
+++ b/regex-filtered/LICENSE
@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright (c) 2024, ua-parser project

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/regex-filtered/README.md b/regex-filtered/README.md
new file mode 100644
index 0000000..1667920
--- /dev/null
+++ b/regex-filtered/README.md
@@ -0,0 +1,97 @@
# regex-filtered: FilteredRE2 for rust-regex

This crate implements the logic behind [`FilteredRE2`] on top of
[`regex`].
The purpose is to allow efficient selection of one or more regexes
matching an input from a *large* set, without having to check every
regex linearly: candidate regexes are prefiltered, and only those are
matched against the input.

This should be preferred to [`regex::RegexSet`] if the regexes are
non-trivial (e.g. non-literal), as [`regex::RegexSet`] constructs a
single state machine which quickly grows huge and slow.

Linear matching does not have *that* issue and works fine with
complex regexes, but it doesn't scale as the number of regexes
increases, and match failures quickly get very expensive (as they
require traversing the entire set every time).

## Usage

``` rust
let matcher = regex_filtered::Builder::new()
    .push("foo")?
    .push("bar")?
    .push("baz")?
    .push("quux")?
    .build()?;

assert!(matcher.is_match("bar"));
assert_eq!(matcher.matching("baz").count(), 1);
assert_eq!(matcher.matching("foo quux").count(), 2);
# Ok::<(), Box<dyn std::error::Error>>(())
```

[`Regexes::is_match`] returns whether *any* pattern in the set matches
the haystack. It is essentially equivalent to
`matcher.matching(...).next().is_some()`.

[`Regexes::matching`] returns an iterator of matching [`regex::Regex`]es
and their corresponding indices. The index can be used to look up
ancillary data (e.g. replacement content), and the [`regex::Regex`]
can be used to [`regex::Regex::find`] or [`regex::Regex::captures`]
data out of the haystack.

## Notes

`regex-filtered` only returns the matching regexes (and their
indices), as capturing especially is *significantly* more expensive
than checking for a match. This slightly pessimises situations where
the prefilter prunes perfectly, but it is a large gain as soon as
that's not the case and the prefilter has to be post-filtered.

## Concepts

From a large set of regexes, extract distinguishing literal tokens,
match the tokens against the input, reverse-look-up which regexes the
matching tokens correspond to, and only run the corresponding regexes
on the input.

This extraction is done by gathering literal items, converting them
to content sets, then symbolically executing concatenations and
alternations (`|`) in order to find out which literal items *need* to
be present in the haystack for a regex to match. A reverse index is
then built from literal items to regexes.

At match time, a prefilter checks which literals are present in the
haystack, the reverse index maps those literals back to candidate
regexes, and only the candidates are matched against the haystack so
that only actually-matching regexes are returned.

## Divergences

While [`FilteredRE2`] requires the user to perform prefiltering,
`regex-filtered` handles this internally: [`aho-corasick`] is pretty
much ideal for that task and already a dependency of [`regex`], which
`regex-filtered` is based on.
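## Example: index lookups and captures

A small sketch tying the above together: the `replacements` table
here is a made-up stand-in for whatever ancillary data an application
stores alongside its patterns, with the indices yielded by
[`Regexes::matching`] keying into that parallel storage while the
returned regexes handle the (more expensive) capture pass.

``` rust
let matcher = regex_filtered::Builder::new()
    .push_all([r"(foo|bar)baz/(\d+)", r"quux/(\d+)"])?
    .build()?;

// hypothetical ancillary data, parallel to the patterns above
let replacements = ["foobar $2", "quux $1"];

for (idx, re) in matcher.matching("barbaz/42") {
    // `idx` keys into whatever data was stored alongside the patterns...
    let _replacement = replacements[idx];
    // ...and `re` can be used to extract captures from the haystack
    assert_eq!(&re.captures("barbaz/42").unwrap()[2], "42");
}
# Ok::<(), Box<dyn std::error::Error>>(())
```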
## TODO

- add a stats feature to report various build-size infos e.g.

  - number of tokens
  - number of regexes
  - number of unfiltered regexes; useful to know whether prefiltering
    will actually happen or a naive sequential application would be a
    better idea
  - ratio of checked regexes to successes (how does it work with lazy
    iterators?)
  - total / prefiltered (- unfiltered) counts, so the impact of the
    atom size can be evaluated
  - also maybe mapper stats on the pruning stuff and whatever

[`aho-corasick`]: https://docs.rs/aho-corasick/
[`FilteredRE2`]: https://github.com/google/re2/blob/main/re2/filtered_re2.h
[`regex`]: https://docs.rs/regex/
[`regex-syntax`]: https://docs.rs/regex-syntax/
diff --git a/regex-filtered/benches/regex.rs b/regex-filtered/benches/regex.rs
new file mode 100644
index 0000000..62d6f94
--- /dev/null
+++ b/regex-filtered/benches/regex.rs
@@ -0,0 +1,34 @@
use criterion::{criterion_group, criterion_main, Criterion};

use regex::Regex;

/// On this trivial synthetic test, the results on an M1P are:
///
/// * 18ns for a match failure
/// * 33ns for a match success
/// * 44ns for a capture failure
/// * 111ns for a capture success
///
/// The cutoff is at an average of n = 1.27 match failures, so it
/// really depends on how selective the prefilter is...
fn bench_regex(c: &mut Criterion) {
    let r = Regex::new(r"(foo|bar)baz/(\d+)\.(\d+)").unwrap();

    c.bench_function("has match - success", |b| {
        b.iter(|| r.is_match("foobaz/1.2"))
    });
    c.bench_function("has match - failure", |b| {
        b.iter(|| r.is_match("fooxbaz/1.2"))
    });

    c.bench_function("match - success", |b| b.iter(|| r.find("foobaz/1.2")));
    c.bench_function("match - failure", |b| b.iter(|| r.find("fooxbaz/1.2")));

    c.bench_function("capture - success", |b| b.iter(|| r.captures("foobaz/1.2")));
    c.bench_function("capture - failure", |b| {
        b.iter(|| r.captures("fooxbaz/1.2"))
    });
}

criterion_group!(benches, bench_regex);
criterion_main!(benches);
diff --git a/regex-filtered/src/lib.rs b/regex-filtered/src/lib.rs
new file mode 100644
index 0000000..9b54604
--- /dev/null
+++ b/regex-filtered/src/lib.rs
@@ -0,0 +1,442 @@
#![doc = include_str!("../README.md")]
#![deny(unsafe_code)]
#![warn(missing_docs)]

use aho_corasick::AhoCorasick;

mod mapper;
mod model;
pub use model::Error as ModelError;

/// Builder for the regexes set
pub struct Builder {
    regexes: Vec<regex::Regex>,
    mapper_builder: mapper::Builder,
}

/// Parser configuration, can be used to tune the regex parsing when
/// adding it to the [`Builder`]. Every option defaults to `false`
/// whether through [`Default`] or [`Options::new`].
///
/// The parser can also be configured via standard [`regex`] inline
/// flags.
#[derive(Default)]
pub struct Options {
    case_insensitive: bool,
    dot_matches_new_line: bool,
    ignore_whitespace: bool,
    multi_line: bool,
    crlf: bool,
}

impl Options {
    /// Create a new options object.
    pub fn new() -> Self {
        Self::default()
    }
    /// Configures case-insensitive matching for the entire pattern.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
        self.case_insensitive = yes;
        self
    }
    /// Configures `.` to match newline characters, by default `.`
    /// matches everything *except* newline characters.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
        self.dot_matches_new_line = yes;
        self
    }
    /// Configures ignoring whitespace inside patterns, as well as `#`
    /// line comments ("verbose" mode).
    ///
    /// Verbose mode is useful to break up complex regexes and improve
    /// their documentation.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
        self.ignore_whitespace = yes;
        self
    }
    /// Configures multi-line mode.
    /// When enabled, `^` matches at every start of line and `$` at
    /// every end of line, by default they match only the start and
    /// end of the string respectively.
    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
        self.multi_line = yes;
        self
    }
    /// Allows `\r` as a line terminator, by default only `\n` is a
    /// line terminator (relevant for [`Self::ignore_whitespace`] and
    /// [`Self::multi_line`]).
    pub fn crlf(&mut self, yes: bool) -> &mut Self {
        self.crlf = yes;
        self
    }
    fn to_regex(&self, pattern: &str) -> Result<regex::Regex, regex::Error> {
        regex::RegexBuilder::new(pattern)
            .case_insensitive(self.case_insensitive)
            .dot_matches_new_line(self.dot_matches_new_line)
            .ignore_whitespace(self.ignore_whitespace)
            .multi_line(self.multi_line)
            .crlf(self.crlf)
            .build()
    }
}
impl From<Options> for regex_syntax::Parser {
    fn from(opt: Options) -> Self {
        Self::from(&opt)
    }
}
impl From<&Options> for regex_syntax::Parser {
    fn from(
        Options {
            case_insensitive,
            dot_matches_new_line,
            ignore_whitespace,
            multi_line,
            crlf,
        }: &Options,
    ) -> Self {
        regex_syntax::ParserBuilder::new()
            .case_insensitive(*case_insensitive)
            .dot_matches_new_line(*dot_matches_new_line)
            .ignore_whitespace(*ignore_whitespace)
            .multi_line(*multi_line)
            .crlf(*crlf)
            .build()
    }
}

/// Parsing error when adding a new regex to the [`Builder`].
#[derive(Debug)]
pub enum ParseError {
    /// An error occurred while parsing the regex or translating it to
    /// HIR.
    SyntaxError(String),
    /// An error occurred while processing the regex for atom
    /// extraction.
    ProcessingError(ModelError),
    /// The regex was too large to compile to the NFA (within the
    /// default limits).
    RegexTooLarge(usize),
}
impl std::error::Error for ParseError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            ParseError::ProcessingError(e) => Some(e),
            ParseError::SyntaxError(_) => None,
            ParseError::RegexTooLarge(_) => None,
        }
    }
}
impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
}
impl From<regex_syntax::Error> for ParseError {
    fn from(value: regex_syntax::Error) -> Self {
        Self::SyntaxError(value.to_string())
    }
}
impl From<regex::Error> for ParseError {
    fn from(value: regex::Error) -> Self {
        match value {
            regex::Error::CompiledTooBig(v) => Self::RegexTooLarge(v),
            e => Self::SyntaxError(e.to_string()),
        }
    }
}
impl From<ModelError> for ParseError {
    fn from(value: ModelError) -> Self {
        Self::ProcessingError(value)
    }
}

/// Error while compiling the builder to a prefiltered set.
#[derive(Debug)]
pub enum BuildError {
    /// Error while building the prefilter.
    PrefilterError(aho_corasick::BuildError),
}
impl std::error::Error for BuildError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            BuildError::PrefilterError(p) => Some(p),
        }
    }
}
impl std::fmt::Display for BuildError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
}
impl From<aho_corasick::BuildError> for BuildError {
    fn from(value: aho_corasick::BuildError) -> Self {
        Self::PrefilterError(value)
    }
}

impl Builder {
    /// Instantiate a builder with the default metadata configuration:
    ///
    /// - minimum atom length 3
    #[must_use]
    pub fn new() -> Self {
        Self::new_atom_len(3)
    }

    /// Instantiate a builder with a custom minimum atom length.
    /// Increasing the atom length decreases the size and cost of the
    /// prefilter, but may make more regexes impossible to prefilter,
    /// which can increase matching costs.
    #[must_use]
    pub fn new_atom_len(min_atom_len: usize) -> Self {
        Self {
            regexes: Vec::new(),
            mapper_builder: mapper::Builder::new(min_atom_len),
        }
    }

    /// Currently loaded regexes.
    pub fn regexes(&self) -> &[regex::Regex] {
        &self.regexes
    }

    /// Push a single regex into the builder, using the default
    /// parsing options.
    pub fn push(self, s: &str) -> Result<Self, ParseError> {
        self.push_opt(s, &Options::new())
    }

    /// Push a single regex into the builder, using custom parsing
    /// options.
    pub fn push_opt(mut self, regex: &str, opts: &Options) -> Result<Self, ParseError> {
        let hir = regex_syntax::Parser::from(opts).parse(regex)?;
        let pf = model::Model::new(&hir)?;
        self.mapper_builder.push(pf);
        self.regexes.push(opts.to_regex(regex)?);
        Ok(self)
    }

    /// Push a batch of regexes into the builder, using the default
    /// parsing options.
    pub fn push_all<T, I>(self, i: I) -> Result<Self, ParseError>
    where
        T: AsRef<str>,
        I: IntoIterator<Item = T>,
    {
        i.into_iter().try_fold(self, |b, s| b.push(s.as_ref()))
    }

    /// Build the regexes set from the current builder.
    ///
    /// Building a regexes set from no regexes is useless but not an
    /// error.
    pub fn build(self) -> Result<Regexes, BuildError> {
        let Self {
            regexes,
            mapper_builder,
        } = self;
        let (mapper, atoms) = mapper_builder.build();

        // Instead of returning a bunch of atoms for the user to
        // manage, since `regex` depends on aho-corasick by default we
        // can use that directly and not bother the user.
        let prefilter = AhoCorasick::builder()
            .ascii_case_insensitive(true)
            .prefilter(true)
            .build(atoms)?;

        Ok(Regexes {
            regexes,
            mapper,
            prefilter,
        })
    }
}

impl Default for Builder {
    fn default() -> Self {
        Self::new()
    }
}

/// Regexes set, allows testing inputs against a *large* number of
/// *non-trivial* regexes.
pub struct Regexes {
    regexes: Vec<regex::Regex>,
    mapper: mapper::Mapper,
    prefilter: AhoCorasick,
}

impl Regexes {
    // TODO:
    // - number of tokens (prefilter.patterns_len())
    // - number of regexes
    // - number of unfiltered regexes (from mapper)
    // - ratio of checked regexes to successes (cfg-gated)
    // - total / prefiltered (- unfiltered?) so atom size can be manipulated
    #[inline]
    fn prefilter<'a>(&'a self, haystack: &'a str) -> impl Iterator<Item = usize> + 'a {
        self.prefilter
            .find_overlapping_iter(haystack)
            .map(|m| m.pattern().as_usize())
    }

    #[inline]
    fn prefiltered(&self, haystack: &str) -> impl Iterator<Item = usize> {
        self.mapper.atom_to_re(self.prefilter(haystack)).into_iter()
    }

    /// Returns *whether* any regex in the set matches the haystack.
    pub fn is_match(&self, haystack: &str) -> bool {
        self.prefiltered(haystack)
            .any(|idx| self.regexes[idx].is_match(haystack))
    }

    /// Yields the regexes matching the haystack along with their
    /// index.
    ///
    /// The results are guaranteed to be returned in ascending order.
    pub fn matching<'a>(
        &'a self,
        haystack: &'a str,
    ) -> impl Iterator<Item = (usize, &'a regex::Regex)> + 'a {
        self.prefiltered(haystack).filter_map(move |idx| {
            let r = &self.regexes[idx];
            r.is_match(haystack).then_some((idx, r))
        })
    }

    /// Returns a reference to all the regexes in the set.
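    ///
    /// The indices yielded by [`Regexes::matching`] are positions in
    /// this slice.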
    pub fn regexes(&self) -> &[regex::Regex] {
        &self.regexes
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use itertools::Itertools;

    #[test]
    fn empty_filter() {
        let f = Builder::new().build().unwrap();
        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);

        assert_eq!(f.matching("foo").count(), 0);
    }

    #[test]
    fn empty_pattern() {
        let f = Builder::new().push("").unwrap().build().unwrap();

        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);

        assert_eq!(
            f.matching("0123").map(|(idx, _)| idx).collect_vec(),
            vec![0]
        );
    }

    #[test]
    fn small_or_test() {
        let f = Builder::new_atom_len(4)
            .push("(foo|bar)")
            .unwrap()
            .build()
            .unwrap();

        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![]);

        assert_eq!(
            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );

        let f = Builder::new().push("(foo|bar)").unwrap().build().unwrap();

        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![1]);

        assert_eq!(
            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );
    }

    #[test]
    fn basic_matches() {
        let f = Builder::new()
            .push("(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+")
            .unwrap()
            .push("abcd..yyy..yyyzzz")
            .unwrap()
            .push("mnmnpp[a-z]+PPP")
            .unwrap()
            .build()
            .unwrap();

        assert_eq!(
            f.matching("abc121212xyz").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );

        assert_eq!(
            f.matching("abc12312yyyzzz")
                .map(|(idx, _)| idx)
                .collect_vec(),
            vec![0],
        );

        assert_eq!(
            f.matching("abcd12yyy32yyyzzz")
                .map(|(idx, _)| idx)
                .collect_vec(),
            vec![0, 1],
        );
    }

    #[test]
    fn basics() {
        // In re2 this is the `MoveSemantics` test, which is... so not
        // necessary for us. But it's a pair of extra regexes we can
        // test

        let f = Builder::new().push("foo\\d+").unwrap().build().unwrap();

        assert_eq!(
            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );
        assert_eq!(
            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![],
        );

        let f = Builder::new().push("bar\\d+").unwrap().build().unwrap();

        assert_eq!(
            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![],
        );
        assert_eq!(
            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );
    }

    #[test]
    fn bulk_api() {
        use std::io::BufRead as _;

        Builder::new().push_all(["a", "b"]).unwrap();

        Builder::new()
            .push_all(vec!["a".to_string(), "b".to_string()])
            .unwrap();

        Builder::new().push_all("a\nb\nc\nd\n".lines()).unwrap();

        Builder::new()
            .push_all(b"a\nb\nc\nd\n".lines().map(|l| l.unwrap()))
            .unwrap();
    }
}
diff --git a/regex-filtered/src/mapper.rs b/regex-filtered/src/mapper.rs
new file mode 100644
index 0000000..a582b9c
--- /dev/null
+++ b/regex-filtered/src/mapper.rs
@@ -0,0 +1,429 @@
use std::collections::{HashMap, HashSet};
use std::fmt::Display;
use std::fmt::Formatter;

use indexmap::IndexSet;

use super::model::Model;

pub struct Builder {
    min_atom_len: usize,
    models: Vec<Model>,
    unfiltered: Vec<usize>,
}
impl Builder {
    pub fn new(min_atom_len: usize) -> Self {
        Self {
            min_atom_len,
            models: Vec::new(),
            unfiltered: Vec::new(),
        }
    }

    pub fn push(&mut self, mut pf: Model) {
        if !self.keep_node(&mut pf) {
            self.unfiltered.push(self.models.len());
            // these go into unfiltered: regexes which always pass
            // through the filter
            // re2 uses nulls here but that's not us
            pf = Model::all();
        }
        self.models.push(pf);
    }
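    /// Decides whether a (pruned) model node is worth keeping for
    /// prefiltering: atoms shorter than `min_atom_len` are dropped,
    /// an AND node keeps whichever children survive, and an OR node
    /// only survives if *all* of its branches do (a dropped branch
    /// would otherwise let matches through unfiltered).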
    fn keep_node(&self, pf: &mut Model) -> bool {
        match pf {
            Model::All(_) | Model::None(_) => false,
            Model::Atom(_, s) => s.len() >= self.min_atom_len,
            Model::And(_, subs) => {
                subs.retain_mut(|p| self.keep_node(p));
                !subs.is_empty()
            }
            Model::Or(_, subs) => subs.iter_mut().all(|p| self.keep_node(p)),
        }
    }

    pub fn build(self) -> (Mapper, Vec<String>) {
        // inlined `assign_unique_ids` because it doesn't seem super useful... to us
        #[allow(clippy::mutable_key_type)]
        let mut nodes = NodeSet::new();
        let mut atoms = Vec::new();
        let mut atom_index_to_id = Vec::new();
        // Build a vector of all filter nodes, sorted topologically
        // from top to bottom: start `v` off with the top-level node
        // of each regexp model
        let mut v = self.models.iter().collect::<Vec<_>>();

        // now add all the descendant nodes; this has to be a `while`
        // loop because `v` grows as we iterate
        let mut i = 0;
        while i < v.len() {
            let p = v[i];
            i += 1;

            if let Model::And(_, s) | Model::Or(_, s) = p {
                v.extend(s.iter());
            }
        }

        let mut unique_id = 0..;
        // identify unique nodes
        for &node in v.iter().rev() {
            if let Some(canonical) = nodes.get(node) {
                node.set_unique_id(canonical.unique_id());
            } else {
                let uid = unique_id.next().expect("infinite");
                node.set_unique_id(uid);
                if let Model::Atom(_, s) = &node {
                    atoms.push(s.to_string());
                    atom_index_to_id.push(uid);
                }
                nodes.insert(node);
            }
        }

        // maybe this could just be a prealloc and we append since id
        // should be a sequence?
        let mut entries = vec![Entry::default(); unique_id.next().expect("infinite(ish) sequence")];
        // Fill the entries
        for model in v.iter().rev() {
            if nodes.get(model) != Some(model) {
                continue;
            }
            let id = model.unique_id();
            match &model {
                Model::None(_) => unreachable!("no idea why this is an error"),
                // We replace excluded models by All rather than null,
                // so those are not unreachable.
                Model::All(_) => (),
                Model::Atom(_, _) => {
                    entries[id].propagate_up_at_count = 1;
                }
                // For each child, we append our id to the child's
                // list of parent ids... unless we happen to have done
                // so already. The number of appends is the number of
                // unique children, which allows correct upward
                // propagation from AND nodes.
                Model::And(_, s) | Model::Or(_, s) => {
                    let mut up_count = 0;
                    for child_id in s.iter().map(|c| c.unique_id()) {
                        let parents = &mut entries[child_id].parents;
                        if parents.last() != Some(&id) {
                            parents.push(id);
                            up_count += 1;
                        }
                    }

                    entries[id].propagate_up_at_count = if matches!(&model, Model::And(..)) {
                        up_count
                    } else {
                        1
                    };
                }
            }
        }

        // For top level nodes, populate regexp id
        for (i, tl) in v[..self.models.len()].iter().enumerate() {
            if let Some(p) = nodes.get(tl) {
                entries[p.unique_id()].regexps.push(i);
            }
        }

        // Lastly, using probability-based heuristics, we identify nodes
        // that trigger too many parents and then we try to prune edges.
        // We use logarithms below to avoid the likelihood of underflow.
        let log_num_regexps = ((self.models.len() - self.unfiltered.len()) as f64).ln();
        // Hoisted this above the loop so that we don't thrash the heap.
        let mut entries_by_num_edges = Vec::<(usize, usize)>::new();
        for model in v.iter().rev() {
            let Model::And(_, s) = &model else {
                continue;
            };
            if nodes.get(model) != Some(model) {
                continue;
            }
            let id = model.unique_id();

            // Sort the current node's children by their number of parents.
            for child_id in s.iter().map(|c| c.unique_id()) {
                entries_by_num_edges.push((entries[child_id].parents.len(), child_id));
            }
            entries_by_num_edges.sort_unstable();

            // A running estimate of how many regexps will be
            // triggered by pruning the remaining children's edges to
            // the current node. Our nominal target is one, so the
            // threshold is log(1) == 0; pruning occurs iff the child
            // has more than nine edges left.
            let mut log_num_triggered = log_num_regexps;
            for (_, child_id) in entries_by_num_edges.drain(..) {
                let parents = &mut entries[child_id].parents;
                if log_num_triggered > 0. {
                    log_num_triggered += (parents.len() as f64).ln();
                    log_num_triggered -= log_num_regexps;
                } else if parents.len() > 9 {
                    if let Some(idx) = parents.iter().position(|&p| p == id) {
                        parents.swap_remove(idx);
                        // re2 uses an `int`, which can go negative; we
                        // use a `usize` (since the count is derived
                        // from the number of children, though that's
                        // probably unnecessary), which means we can't
                        // keep decrementing below 0
                        entries[id].propagate_up_at_count =
                            entries[id].propagate_up_at_count.saturating_sub(1);
                    }
                }
            }
        }

        (
            Mapper {
                entries,
                unfiltered: self.unfiltered,
                atom_to_entry: atom_index_to_id,
                regexp_count: self.models.len(),
            },
            atoms,
        )
    }
}

impl Display for Mapper {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "#Unique Atoms: {}", self.atom_to_entry.len())?;
        for (i, e) in self.atom_to_entry.iter().copied().enumerate() {
            writeln!(f, "\tatom {i} -> entry {e}")?;
            for r in self.propagate_match([e].into()) {
                writeln!(f, "\t\tregex {r}")?;
            }
        }

        writeln!(f, "#Unique Entries: {}", self.entries.len())?;
        for (i, entry) in self.entries.iter().enumerate() {
            writeln!(
                f,
                "\tEntry: {i} Regexps: {} Threshold: {}",
                entry.regexps.len(),
                entry.propagate_up_at_count,
            )?;
            for parent in &entry.parents {
                writeln!(f, "\t\tParent {parent}")?;
            }
        }
        Ok(())
    }
}

type NodeSet<'a> = std::collections::HashSet<&'a Model>;

/// Each unique node has a corresponding Entry that helps in passing
/// the matching trigger information along the tree.
#[derive(Default, Clone, Debug)]
struct Entry {
    /// How many children should match before this node triggers the
    /// parent. For an atom and an OR node, this is 1 and for an AND
    /// node, it is the number of unique children.
    propagate_up_at_count: usize,

    /// When this node is ready to trigger the parent, what are the indices
    /// of the parent nodes to trigger. The reason there may be more than
    /// one is because of sharing. For example (abc | def) and (xyz | def)
    /// are two different nodes, but they share the atom 'def'. So when
    /// 'def' matches, it triggers two parents, corresponding to the two
    /// different OR nodes.
    parents: Vec<usize>,

    /// When this node is ready to trigger the parent, what are the
    /// regexps that are triggered.
    regexps: Vec<usize>,
}
pub struct Mapper {
    /// Number of regexes covered by the mapper
    regexp_count: usize,
    /// Nodes formed by build, there is one node for each unique atom
    /// and each unique and/or node
    entries: Vec<Entry>,
    /// Indices of regexps which always make it through the filter
    /// (no distinguishing literals were found in them)
    unfiltered: Vec<usize>,
    /// Atom index to entry id mapping
    atom_to_entry: Vec<usize>,
}
impl Mapper {
    // TODO: better name; also see whether this could yield matches on
    // the fly instead of building a Vec
    pub fn atom_to_re(&self, atoms: impl IntoIterator<Item = usize>) -> Vec<usize> {
        let matched_atom_ids = atoms
            .into_iter()
            .map(|idx| self.atom_to_entry[idx])
            .collect();
        let regexps_map = self.propagate_match(matched_atom_ids);

        let mut regexps = Vec::with_capacity(regexps_map.len() + self.unfiltered.len());
        regexps.extend(&self.unfiltered);
        regexps.extend(regexps_map);

        regexps.sort_unstable();
        regexps
    }

    fn propagate_match(&self, mut work: IndexSet<usize>) -> HashSet<usize> {
        work.reserve(self.entries.len() - work.len());
        let mut count = HashMap::with_capacity(self.entries.len());

        let mut regexps = HashSet::with_capacity(self.regexp_count);

        let mut i = 0;
        while i < work.len() {
            let idx = work[i];
            i += 1;

            let entry = &self.entries[idx];
            // record regexps triggered
            regexps.extend(&entry.regexps);
            // pass trigger up to parents
            for &j in &entry.parents {
                let parent = &self.entries[j];
                // Delay until all the children have succeeded.
                if parent.propagate_up_at_count > 1 {
                    let c = count.entry(j).and_modify(|e| *e += 1).or_insert(1);
                    if *c < parent.propagate_up_at_count {
                        continue;
                    }
                }
                work.insert(j);
            }
        }

        regexps
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::model::Model;
    use regex_syntax::parse;

    #[test]
    fn empty_matcher() {
        let (m, atoms) = Builder::new(3).build();
        assert_eq!(atoms.len(), 0);
        assert_eq!(&m.unfiltered, &[]);
    }

    #[test]
    fn empty_pattern() {
        let mut b = Builder::new(3);
        b.push(Model::new(&parse("").unwrap()).unwrap());
        let (m, atoms) = b.build();
        assert_eq!(atoms.len(), 0);
        assert_eq!(&m.unfiltered, &[0]);
    }

    #[test]
    fn small_or_test() {
        let mut b = Builder::new(4);
        b.push(Model::new(&parse("(foo|bar)").unwrap()).unwrap());
        let (m, atoms) = b.build();
        assert_eq!(atoms.len(), 0);
        assert_eq!(&m.unfiltered, &[0]);
        assert_eq!(&m.atom_to_entry, &[])
    }

    #[test]
    fn reverse_index() {
        let mut b = Builder::new(3);
        b.push(Model::new(&parse("(foo|bar)").unwrap()).unwrap());
        let (m, _) = b.build();

        assert_eq!(m.entries.len(), 3);
        assert_eq!(&m.atom_to_entry, &[0, 1]);
        assert_eq!(m.propagate_match([0].into()), [0].into());
        assert_eq!(m.propagate_match([1].into()), [0].into());
    }

    fn check_patterns(patterns: &'static [&'static str], expected: &'static [&'static str]) {
        let mut b = Builder::new(3);
        for pattern in patterns {
            b.push(Model::new(&parse(pattern).unwrap()).unwrap());
        }
        let (_, mut atoms) = b.build();

        atoms.sort();
        let mut sortspected = expected.to_vec();
        sortspected.sort();
        assert_eq!(atoms, sortspected);
    }

    #[test]
    fn empty_patterns_are_allowed() {
        check_patterns(&[""], &[]);
    }

    #[test]
    fn all_atoms_greater_than_minlength_are_found_and_none_smaller() {
        check_patterns(
            &[
                "(abc123|def456|ghi789).*mnop[x-z]+",
                "abc..yyy..zz",
                "mnmnpp[a-z]+PPP",
            ],
            &[
                "abc123", "def456", "ghi789", "mnop", "abc", "yyy", "mnmnpp", "ppp",
            ],
        );
    }
    #[test]
    fn shortest_substrings_are_kept() {
        check_patterns(
            &[
                "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+",
                "abcd..yyy..yyyzzz",
                "mnmnpp[a-z]+PPP",
            ],
            &[
                "abc", "ghi789", "xyz", "abcd", "yyy", "yyyzzz", "mnmnpp", "ppp",
            ],
        );
    }

    #[test]
    fn character_class_expansion() {
        check_patterns(
            &["m[a-c][d-f]n.*[x-z]+", "[x-y]bcde[ab]"],
            &[
                "madn", "maen", "mafn", "mbdn", "mben", "mbfn", "mcdn", "mcen", "mcfn", "xbcdea",
                "xbcdeb", "ybcdea", "ybcdeb",
            ],
        );
    }
    #[test]
    fn non_ascii_casefolding() {
        check_patterns(
            &[
                // re2 apparently does some sort of strange normalisation
                // pass which regex does not and which does not seem
                // entirely kosher (might be a unicode-aware but
                // per-character upper then lower since it gets the final
                // position sigma "wrong")
                //"(?i)ΔδΠϖπΣςσ",
                "ΛΜΝΟΠ",
                "ψρστυ",
            ],
            &[
                //"δδπππσσσ",
                "λμνοπ",
                "ψρστυ",
            ],
        );
    }

    #[test]
    fn test_empty_string_in_string_set() {
        let mut b = Builder::new(0);
        b.push(Model::new(&parse("-R.+(|ADD=;AA){12}}").unwrap()).unwrap());
        let (_, mut atoms) = b.build();
        atoms.sort();

        assert_eq!(atoms, vec!["", "-r", "add=;aa", "}"]);
    }
}
diff --git a/regex-filtered/src/model.rs b/regex-filtered/src/model.rs
new file mode 100644
index 0000000..1270aff
--- /dev/null
+++ b/regex-filtered/src/model.rs
@@ -0,0 +1,516 @@
use itertools::iproduct;
use regex_syntax::hir::{self, visit, Hir, HirKind, Visitor};
use std::cell::Cell;
use std::fmt::{Display, Formatter, Write};
use std::str::Utf8Error;
use std::{collections::BTreeSet, ops::Deref};

#[derive(Clone, Debug)]
pub enum Model {
    /// Everything matches.
    All(Cell<usize>),
    /// Nothing matches.
    None(Cell<usize>),
    /// The string matches.
    Atom(Cell<usize>, String),
    /// All sub-filters must match.
    And(Cell<usize>, Vec<Model>),
    /// One sub-filter must match.
    Or(Cell<usize>, Vec<Model>),
}
use Model::{All, And, Atom, None, Or};

impl std::hash::Hash for Model {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        state.write_u8(self.op());
        match self {
            All(_) | None(_) => (),
            Atom(_, s) => s.hash(state),
            And(_, ps) | Or(_, ps) => {
                state.write_usize(ps.len());
                for p in ps {
                    state.write_usize(p.unique_id());
                }
            }
        }
    }
}

impl std::cmp::PartialEq for Model {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (All(_), All(_)) | (None(_), None(_)) => true,
            (Atom(_, a), Atom(_, b)) => a == b,
            (And(_, va), And(_, vb)) | (Or(_, va), Or(_, vb)) => {
                va.len() == vb.len()
                    && std::iter::zip(va, vb).all(|(a, b)| a.unique_id() == b.unique_id())
            }
            _ => false,
        }
    }
}
impl Eq for Model {}

impl From<String> for Model {
    fn from(s: String) -> Self {
        Atom(Cell::new(usize::MAX), s)
    }
}

impl Display for Model {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match &self {
            All(_) => f.write_str(""),
            None(_) => f.write_str("*no-matches*"),
            Atom(_, s) => f.write_str(s),
            And(_, subs) => {
                for (i, s) in subs.iter().enumerate() {
                    if i != 0 {
                        f.write_char(' ')?;
                    }
                    write!(f, "{s}")?;
                }
                Ok(())
            }
            Or(_, subs) => {
                f.write_char('(')?;
                for (i, s) in subs.iter().enumerate() {
                    if i != 0 {
                        f.write_char('|')?;
                    }
                    write!(f, "{s}")?;
                }
                f.write_char(')')
            }
        }
    }
}

/// Processing errors
#[derive(Debug)]
pub enum Error {
    /// Processing missed or exceeded some of the stack
    FinalizationError,
    /// Processing reached the HIR node limit
    EarlyStop,
    /// Literal was not a valid string
    DecodeError(Utf8Error),
    /// Non-decodable character class
    ClassError(hir::ClassBytes),
}
impl Display for Error {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
}
impl std::error::Error for Error {}
impl From<Utf8Error> for Error {
    fn from(value: Utf8Error) -> Self {
        Error::DecodeError(value)
    }
}

impl Model {
    pub fn new(r: &Hir) -> Result<Self, Error> {
        visit(r, InfoVisitor::default())
    }

    pub fn unique_id(&self) -> usize {
        match self {
            All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.get(),
        }
    }
    pub fn set_unique_id(&self, value: usize) {
        match self {
            All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.set(value),
        }
    }

    pub fn all() -> Self {
        All(Cell::new(usize::MAX))
    }

    pub fn none() -> Self {
        None(Cell::new(usize::MAX))
    }

    fn or_strings(strings: SSet) -> Self {
        Model::Or(
            Cell::new(usize::MAX),
            simplify_string_set(strings).map(From::from).collect(),
        )
    }

    fn op(&self) -> u8 {
        match self {
            All(_) => 0,
            None(_) => 1,
            Atom(_, _) => 2,
            And(_, _) => 3,
            Or(_, _) => 4,
        }
    }

    /// Simplifies And and Or nodes
    fn simplify(self) -> Self {
        match self {
            And(uid, v) if v.is_empty() => All(uid),
            Or(uid, v) if v.is_empty() => None(uid),
            And(_, mut v) | Or(_, mut v) if v.len() == 1 => {
                v.pop().expect("we checked the length").simplify()
            }
            s => s,
        }
    }
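    // Conjunction of two models: ALL acts as the identity element
    // and NONE as the absorbing element, and nested ANDs are
    // flattened so the tree stays shallow.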
    // re2 merges these into separate helper functions, but that only
    // saves on the header and increases the branching complexity of
    // the rest, so why bother?
    fn and(self, mut b: Self) -> Self {
        let mut a = self.simplify();
        b = b.simplify();

        // Canonicalize: a->op <= b->op.
        if a.op() > b.op() {
            std::mem::swap(&mut a, &mut b);
        }

        // ALL and NONE are the smallest opcodes.
        a = match a {
            // ALL and b = b
            All(..) => return b,
            // NONE and b = NONE
            None(uid) => return None(uid),
            a => a,
        };

        match (a, b) {
            // If a and b match op, merge their contents.
            (And(unique_id, mut va), And(_, vb)) => {
                va.extend(vb);
                And(unique_id, va)
            }
            // If a or b matches the operation, merge the other one in
            (And(unique_id, mut v), vv) | (vv, And(unique_id, mut v)) => {
                v.push(vv);
                And(unique_id, v)
            }
            (a, b) => And(Cell::new(usize::MAX), vec![a, b]),
        }
    }

    fn or(self, mut b: Self) -> Self {
        let mut a = self.simplify();
        b = b.simplify();

        // Canonicalize: a->op <= b->op.
        if a.op() > b.op() {
            std::mem::swap(&mut a, &mut b);
        }

        a = match a {
            // NONE or b = b
            None(..) => return b,
            // ALL or b = ALL
            All(uid) => return All(uid),
            a => a,
        };

        match (a, b) {
            // If a and b match op, merge their contents.
            (Or(unique_id, mut va), Or(_, vb)) => {
                va.extend(vb);
                Or(unique_id, va)
            }
            // If a or b matches the operation, merge the other one in
            (Or(unique_id, mut v), vv) | (vv, Or(unique_id, mut v)) => {
                v.push(vv);
                Or(unique_id, v)
            }
            (a, b) => Or(Cell::new(usize::MAX), vec![a, b]),
        }
    }
}

// Necessary for simplify_string_set to work: the simplification
// consists of removing every "superset" of another string of the
// set, that is any string which contains another (non-empty) string
// of the set, because the smaller atom already indicates that the
// pattern is a candidate, so also matching the larger atom is
// useless.
//
// In order to make the implementation simpler and more efficient,
// visit the smaller strings first; that way we only need to check the
// following siblings (larger strings which *might* contain the
// current one).
#[derive(PartialEq, Eq, Debug, Clone)]
struct LengthThenLex(pub String);
impl Deref for LengthThenLex {
    type Target = String;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl Ord for LengthThenLex {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.0
            .len()
            .cmp(&other.0.len())
            .then_with(|| self.0.cmp(&other.0))
    }
}
impl PartialOrd for LengthThenLex {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
type SSet = BTreeSet<LengthThenLex>;
fn simplify_string_set(strings: SSet) -> impl Iterator<Item = String> {
    let mut to_keep = vec![true; strings.len()];
    let mut e = strings.iter().enumerate();
    while let Some((i, s)) = e.next() {
        if s.is_empty() || !to_keep[i] {
            continue;
        }

        for (keep, (_, s2)) in to_keep[i..].iter_mut().skip(1).zip(e.clone()) {
            if *keep && s2.len() > s.len() && s2.0.contains(&s.0) {
                *keep = false;
            }
        }
    }

    std::iter::zip(to_keep, strings)
        .filter(|v| v.0)
        .map(|v| v.1 .0)
}

/// Intermediate information about the set of strings a regex matches,
/// used for the computation of a prefilter.
#[derive(Debug)]
enum Info {
    Match(Model),
    Exact(SSet),
}
impl Info {
    fn take_match(self) -> Model {
        match self {
            Self::Match(p) => p,
            Self::Exact(s) => Model::or_strings(s),
        }
    }

    fn into_exact(self) -> Option<SSet> {
        match self {
            Self::Exact(s) => Some(s),
            Self::Match(_) => Option::None,
        }
    }
}

struct InfoVisitor {
    stack: Vec<Info>,
    max_visits: usize,
}
impl Default for InfoVisitor {
    fn default() -> Self {
        Self {
            max_visits: 100_000,
            stack: Vec::new(),
        }
    }
}

// [`regex_syntax::hir::Visitor`] works quite differently from
// `re2::Regexp::Walker`, as it does not return / merge anything, so
// we need to merge down into the stack on post.
impl Visitor for InfoVisitor {
    type Output = Model;
    type Err = Error;

    fn finish(mut self) -> Result<Self::Output, Self::Err> {
        (self.stack.len() == 1)
            .then_some(&mut self.stack)
            .and_then(|s| s.pop())
            .map(Info::take_match)
            .ok_or(Error::FinalizationError)
    }

    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        // re2 sets `stopped_early` and calls `ShortVisit` but keeps
        // on keeping on, it's not clear why; ultimately BuildInfo only
        // cares about having stopped early
        self.max_visits = self.max_visits.checked_sub(1).ok_or(Error::EarlyStop)?;

        Ok(())
    }

    fn visit_post(&mut self, hir: &Hir) -> Result<(), Self::Err> {
        match hir.kind() {
            HirKind::Empty | HirKind::Look(_) => {
                self.stack
                    .push(Info::Exact([LengthThenLex(String::new())].into()));
            }
            HirKind::Literal(hir::Literal(data)) => {
                if data.is_empty() {
                    // NoMatch
                    self.stack.push(Info::Match(Model::none()));
                } else {
                    // re2 does this weirdly, as it performs a cross
                    // product of individual characters, but as far as
                    // I understand that's just a complicated way to
                    // build a singleton set of the payload?
                    self.stack.push(Info::Exact(
                        [LengthThenLex(std::str::from_utf8(data)?.to_lowercase())].into(),
                    ));
                }
            }
            HirKind::Class(cls) => {
                let uc;
                let c = match cls {
                    hir::Class::Unicode(c) => c,
                    hir::Class::Bytes(b) => {
                        uc = b
                            .to_unicode_class()
                            .ok_or_else(|| Error::ClassError(b.clone()))?;
                        &uc
                    }
                };
                self.stack
                    .push(if c.iter().map(|r| r.len()).sum::<usize>() > 10 {
                        Info::Match(Model::all())
                    } else {
                        Info::Exact(
                            c.iter()
                                .flat_map(|r| (r.start()..=r.end()))
                                .map(char::to_lowercase)
                                .map(String::from_iter)
                                .map(LengthThenLex)
                                .collect(),
                        )
                    });
            }
            // Apparently re2 and regex make inverse choices: re2
            // normalises repetitions to */+/?, while regex normalises
            // everything to {a, b}, so this may or may not make sense
            HirKind::Repetition(hir::Repetition { min, .. }) => {
                if *min == 0 {
                    // corresponds to */? (star/quest)
                    self.stack.pop();
                    self.stack.push(Info::Match(Model::all()));
                } else {
                    // corresponds to +
                    let arg = self
                        .stack
                        .pop()
                        .expect("a repetition to be associated with a pattern to repeat")
                        .take_match();
                    self.stack.push(Info::Match(arg));
                }
            }
            // should just leave its child on the stack for whoever
            // lives up the tree
            HirKind::Capture(_) => (),
            HirKind::Alternation(alt) => {
                // needs to pop alt.len() items from the stack, and if
                // they're ``exact`` then just merge them, otherwise
                // ``Prefilter::Or`` them

                // sort the top n entries to have the exacts at the
                // top (end of the stack), largest last
                let topn = self.stack.len() - alt.len()..;
                let infos = &mut self.stack[topn.clone()];

                let matches =
                    topn.start + infos.iter().filter(|v| matches!(v, Info::Match(_))).count();
                // I think we can do that because we don't actually
                // regex-match here, so order should not matter(?)
                infos.sort_unstable_by_key(|v| match v {
                    Info::Match(_) => (false, 0),
                    Info::Exact(s) => (true, s.len()),
                });
                // there are exact matches, merge them
                let exacts = self
                    .stack
                    .drain(matches..)
                    .rev()
                    .fold(BTreeSet::new(), |mut s, i| {
                        s.append(
                            &mut i
                                .into_exact()
                                .expect("the top `matches` records should be exacts"),
                        );
                        s
                    });
                let mut matches = self
                    .stack
                    .drain(topn)
                    .map(Info::take_match)
                    .collect::<Vec<_>>();
                self.stack.push(if matches.is_empty() {
                    Info::Exact(exacts)
                } else {
                    if !exacts.is_empty() {
                        matches.push(Model::or_strings(exacts));
                    }
                    Info::Match(
                        matches
                            .into_iter()
                            .map(From::from)
                            .fold(Model::none(), Model::or),
                    )
                });
            }
            // and this one gets really painful: like above we need to
            // take the top n, but unlike the above we can't reorder
            // our items
            HirKind::Concat(c) => {
                let topn = self.stack.len() - c.len()..;

                // ALL is the identity element of AND
                let mut result = Info::Match(Model::all());
                let mut exacts = BTreeSet::new();
                for info in self.stack.drain(topn) {
                    match info {
                        Info::Exact(set) if exacts.is_empty() => {
                            exacts = set;
                        }
                        Info::Exact(set) if set.len() * exacts.len() <= 16 => {
                            // Not useful to consume the existing
                            // `exacts` up-front, as each item has to
                            // be splatted over `set`.
                            exacts = iproduct!(&exacts, &set)
                                .map(|(s, ss)| {
                                    let mut r = String::with_capacity(s.len() + ss.len());
                                    r.push_str(s);
                                    r.push_str(ss);
                                    LengthThenLex(r)
                                })
                                .collect();
                        }
                        i => {
                            // here AND together the incoming info,
                            // the accumulated exacts, and the
                            // existing result
                            let mut p = result.take_match();
                            if !exacts.is_empty() {
                                p = Model::and(p, Model::or_strings(std::mem::take(&mut exacts)));
                            }
                            p = Model::and(p, i.take_match());
                            result = Info::Match(p);
                        }
                    }
                }

                if exacts.is_empty() {
                    self.stack.push(result);
                } else {
                    self.stack.push(Info::Match(Model::and(
                        result.take_match(),
                        Model::or_strings(exacts),
                    )));
                }
            }
        }
        Ok(())
    }
}
diff --git a/ua-parser/Cargo.toml b/ua-parser/Cargo.toml
new file mode 100644
index 0000000..8d6e06b
--- /dev/null
+++ b/ua-parser/Cargo.toml
@@ -0,0 +1,19 @@
[package]
name = "ua-parser"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
description = "Rust implementation of the User Agent String Parser project"
keywords = ["ua_parser", "user-agent", "user-agent-parser"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
regex = "1.10.4"
regex-filtered = { version = "0.1.0", path = "../regex-filtered" }
serde = { version = "1.0.203", features = ["derive"] }

[dev-dependencies]
clap = { version = "4.5.6", features = ["derive"] }
serde_json = "1.0.117"
serde_yaml = "0.9.34"
diff --git a/ua-parser/LICENSE b/ua-parser/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/ua-parser/LICENSE
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/ua-parser/examples/bench.rs b/ua-parser/examples/bench.rs
new file mode 100644
index 0000000..460a271
--- /dev/null
+++ b/ua-parser/examples/bench.rs
@@ -0,0 +1,46 @@
+use clap::Parser;
+use std::io::{BufRead, BufReader};
+use std::path::PathBuf;
+
+#[derive(Parser, Debug)]
+struct Args {
+    /// regexes.yaml file to parse the data file with
+    regexes: PathBuf,
+    /// user agents file
+    user_agents: PathBuf,
+    /// number of repetitions through the user agent file
+    #[arg(short, long, default_value_t = 1)]
+    repetitions: usize,
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let Args {
+        regexes,
+        user_agents,
+        repetitions,
+    } = Args::parse();
+
+    let f = std::fs::File::open(regexes)?;
+    let r = ua_parser::Extractor::try_from(serde_yaml::from_reader::<_, ua_parser::Regexes>(f)?)?;
+
+    let uas = BufReader::new(std::fs::File::open(user_agents)?)
+        .lines()
+        .collect::<Result<Vec<_>, _>>()?;
+
+    let start = std::time::Instant::now();
+    for _ in 0..repetitions {
+        for ua in &uas {
+            drop(r.extract(ua));
+        }
+    }
+
+    let elapsed = start.elapsed();
+    println!("Lines: {}", repetitions * uas.len());
+    println!("Total time: {elapsed:?}");
+    println!(
+        "{}µs / line",
+        elapsed.as_micros() / (repetitions * uas.len()) as u128
+    );
+
+    Ok(())
+}
diff --git a/ua-parser/src/lib.rs b/ua-parser/src/lib.rs
new file mode 100644
index 0000000..9f9d0d5
--- /dev/null
+++ b/ua-parser/src/lib.rs
@@ -0,0 +1,609 @@
+#![deny(unsafe_code)]
+#![warn(missing_docs)]
+#![allow(clippy::empty_docs)]
+#![doc = include_str!("../../README.md")]
+
+use regex::Captures;
+use serde::Deserialize;
+
+pub use regex_filtered::{BuildError, ParseError};
+
+mod resolvers;
+
+/// Error returned if the conversion of [`Regexes`] to [`Extractor`]
+/// fails.
+#[derive(Debug)]
+pub enum Error {
+    /// Compilation failed because one of the input regexes could not
+    /// be parsed or processed.
+    ParseError(ParseError),
+    /// Compilation failed because one of the prefilters could not be
+    /// built.
+    BuildError(BuildError),
+    /// A replacement template requires a group missing from the regex.
+    MissingGroup(usize),
+}
+impl std::error::Error for Error {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Error::ParseError(p) => Some(p),
+            Error::BuildError(b) => Some(b),
+            Error::MissingGroup(_) => None,
+        }
+    }
+}
+impl std::fmt::Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+impl From<ParseError> for Error {
+    fn from(value: ParseError) -> Self {
+        Self::ParseError(value)
+    }
+}
+impl From<BuildError> for Error {
+    fn from(value: BuildError) -> Self {
+        Self::BuildError(value)
+    }
+}
+
+/// Deserialization target for the parser descriptors, can be used
+/// with the relevant serde implementation to load from `regexes.yaml`
+/// or a conversion thereof.
+///
+/// Can then be compiled to a full [`Extractor`], or an individual
+/// list of parsers can be converted to the corresponding extractor.
+#[allow(missing_docs)]
+#[derive(Deserialize)]
+pub struct Regexes<'a> {
+    pub user_agent_parsers: Vec<user_agent::Parser<'a>>,
+    pub os_parsers: Vec<os::Parser<'a>>,
+    pub device_parsers: Vec<device::Parser<'a>>,
+}
+
+impl<'a> TryFrom<Regexes<'a>> for Extractor<'a> {
+    type Error = Error;
+    /// Compile parsed regexes to the corresponding full extractor.
+    ///
+    /// Prefer the individual builders / extractors if you don't need
+    /// all three domains extracted, as each individual extractor is
+    /// somewhat costly to create.
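+    ///
+    /// A sketch of building a single domain instead, assuming a
+    /// `regexes` value deserialized as in the crate docs:
+    ///
+    /// ```ignore
+    /// let os = ua_parser::os::Builder::new()
+    ///     .push_all(regexes.os_parsers)?
+    ///     .build()?;
+    /// ```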
+    fn try_from(r: Regexes<'a>) -> Result<Self, Self::Error> {
+        let ua = r
+            .user_agent_parsers
+            .into_iter()
+            .try_fold(user_agent::Builder::new(), |b, p| b.push(p))?
+            .build()?;
+        let os = r
+            .os_parsers
+            .into_iter()
+            .try_fold(os::Builder::new(), |b, p| b.push(p))?
+            .build()?;
+        let dev = r
+            .device_parsers
+            .into_iter()
+            .try_fold(device::Builder::new(), |b, p| b.push(p))?
+            .build()?;
+        Ok(Extractor { ua, os, dev })
+    }
+}
+
+/// Full extractor, simply delegates to the underlying individual
+/// extractors for the actual job.
+#[allow(missing_docs)]
+pub struct Extractor<'a> {
+    pub ua: user_agent::Extractor<'a>,
+    pub os: os::Extractor<'a>,
+    pub dev: device::Extractor<'a>,
+}
+impl<'a> Extractor<'a> {
+    /// Performs the extraction on every sub-extractor in sequence.
+    pub fn extract(
+        &'a self,
+        ua: &'a str,
+    ) -> (
+        Option<user_agent::ValueRef<'a>>,
+        Option<os::ValueRef<'a>>,
+        Option<device::ValueRef<'a>>,
+    ) {
+        (
+            self.ua.extract(ua),
+            self.os.extract(ua),
+            self.dev.extract(ua),
+        )
+    }
+}
+
+/// User agent module.
+///
+/// The user agent is the representation of the browser; in UAP lingo
+/// it is composed of a *family* (the browser project) and a *version*
+/// of up to 4 segments.
+pub mod user_agent {
+    use serde::Deserialize;
+    use std::borrow::Cow;
+
+    use crate::resolvers::{FallbackResolver, FamilyResolver};
+    use regex_filtered::BuildError;
+
+    /// Individual user agent parser description. Plain data which can
+    /// be deserialized from serde-compatible storage, or created
+    /// literally (e.g. using a conversion or build script).
+    #[derive(Deserialize, Default)]
+    pub struct Parser<'a> {
+        /// Regex to check the UA against; if the regex matches, the
+        /// parser applies.
+        pub regex: Cow<'a, str>,
+        /// If set, used for the [`ValueRef::family`] field. If it
+        /// contains a `$1` placeholder, that is replaced by the value
+        /// of the first match group.
+        ///
+        /// If unset, the first match group is used directly.
+        pub family_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the major version number,
+        /// otherwise the second match group is used.
+        pub v1_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the minor version number,
+        /// otherwise the third match group is used.
+        pub v2_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the patch version number,
+        /// otherwise the fourth match group is used.
+        pub v3_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the minor patch version
+        /// number, otherwise the fifth match group is used.
+        pub v4_replacement: Option<Cow<'a, str>>,
+    }
+
+    type Repl<'a> = (
+        FamilyResolver<'a>,
+        // Per spec, should actually be restrict-templated (same as
+        // family but for indexes 2-5 instead of 1).
+        FallbackResolver<'a>,
+        FallbackResolver<'a>,
+        FallbackResolver<'a>,
+        FallbackResolver<'a>,
+    );
+
+    /// Extractor builder, used to `push` parsers into before building
+    /// the extractor.
+    #[derive(Default)]
+    pub struct Builder<'a> {
+        builder: regex_filtered::Builder,
+        repl: Vec<Repl<'a>>,
+    }
+    impl<'a> Builder<'a> {
+        /// Initialise an empty builder.
+        pub fn new() -> Self {
+            Self::default()
+        }
+
+        /// Build the extractor; may be called without pushing any
+        /// parser in, though that is not very useful.
+        pub fn build(self) -> Result<Extractor<'a>, BuildError> {
+            let Self { builder, repl } = self;
+
+            Ok(Extractor {
+                matcher: builder.build()?,
+                repl,
+            })
+        }
+
+        /// Pushes a parser into the builder; may fail if the
+        /// [`Parser::regex`] is invalid, or if
+        /// [`Parser::family_replacement`] contains a `$1` placeholder
+        /// but the regex has no group.
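+        ///
+        /// A sketch with a hypothetical pattern (not from the real
+        /// `regexes.yaml`):
+        ///
+        /// ```ignore
+        /// let builder = Builder::new().push(Parser {
+        ///     regex: r"(Firefox)/(\d+)\.(\d+)".into(),
+        ///     ..Default::default()
+        /// })?;
+        /// ```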
+        pub fn push(mut self, ua: Parser<'a>) -> Result<Self, crate::Error> {
+            self.builder = self.builder.push(&ua.regex)?;
+            let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
+            // number of groups in regex, excluding implicit entire match group
+            let groups = r.captures_len() - 1;
+            self.repl.push((
+                FamilyResolver::new(ua.family_replacement, groups)?,
+                FallbackResolver::new(ua.v1_replacement, groups, 2),
+                FallbackResolver::new(ua.v2_replacement, groups, 3),
+                FallbackResolver::new(ua.v3_replacement, groups, 4),
+                FallbackResolver::new(ua.v4_replacement, groups, 5),
+            ));
+            Ok(self)
+        }
+
+        /// Bulk loading of parsers into the builder.
+        pub fn push_all<I>(self, ua: I) -> Result<Self, crate::Error>
+        where
+            I: IntoIterator<Item = Parser<'a>>,
+        {
+            ua.into_iter().try_fold(self, |s, p| s.push(p))
+        }
+    }
+
+    /// User Agent extractor.
+    pub struct Extractor<'a> {
+        matcher: regex_filtered::Regexes,
+        repl: Vec<Repl<'a>>,
+    }
+    impl<'a> Extractor<'a> {
+        /// Tries the loaded [`Parser`]s in order; upon finding the
+        /// first matching one, performs data extraction following its
+        /// replacement directives and returns the result.
+        ///
+        /// Returns [`None`] if:
+        ///
+        /// - no matching parser was found
+        /// - the match does not have any matching groups *and*
+        ///   [`Parser::family_replacement`] is unset
+        /// - [`Parser::family_replacement`] has a substitution
+        ///   but there is no group in the regex
+        pub fn extract(&'a self, ua: &'a str) -> Option<ValueRef<'a>> {
+            let (idx, re) = self.matcher.matching(ua).next()?;
+            let c = re.captures(ua)?;
+
+            let (f, v1, v2, v3, v4) = &self.repl[idx];
+
+            Some(ValueRef {
+                family: f.resolve(&c),
+                major: v1.resolve(&c),
+                minor: v2.resolve(&c),
+                patch: v3.resolve(&c),
+                patch_minor: v4.resolve(&c),
+            })
+        }
+    }
+    /// Borrowed extracted value, borrows the content of the original
+    /// parser or the content of the user agent string, unless a
+    /// replacement is performed (which is only possible for the
+    /// family).
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct ValueRef<'a> {
+        ///
+        pub family: Cow<'a, str>,
+        ///
+        pub major: Option<&'a str>,
+        ///
+        pub minor: Option<&'a str>,
+        ///
+        pub patch: Option<&'a str>,
+        ///
+        pub patch_minor: Option<&'a str>,
+    }
+
+    impl ValueRef<'_> {
+        /// Converts the borrowed result into an owned one,
+        /// independent from both the extractor and the user agent
+        /// string.
+        pub fn into_owned(self) -> Value {
+            Value {
+                family: self.family.into_owned(),
+                major: self.major.map(|c| c.to_string()),
+                minor: self.minor.map(|c| c.to_string()),
+                patch: self.patch.map(|c| c.to_string()),
+                patch_minor: self.patch_minor.map(|c| c.to_string()),
+            }
+        }
+    }
+
+    /// Owned extracted value, identical to [`ValueRef`] but not
+    /// linked to either the UA string or the extractor.
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct Value {
+        ///
+        pub family: String,
+        ///
+        pub major: Option<String>,
+        ///
+        pub minor: Option<String>,
+        ///
+        pub patch: Option<String>,
+        ///
+        pub patch_minor: Option<String>,
+    }
+}
+
+/// OS extraction module
+pub mod os {
+    use serde::Deserialize;
+    use std::borrow::Cow;
+
+    use regex_filtered::{BuildError, ParseError};
+
+    use crate::resolvers::{OptResolver, Resolver};
+
+    /// OS parser configuration
+    #[derive(Deserialize, Default)]
+    pub struct Parser<'a> {
+        ///
+        pub regex: Cow<'a, str>,
+        /// Replacement for the [`ValueRef::os`]; must be set if there
+        /// is no capture in the [`Self::regex`]. If there are
+        /// captures, it may be fully templated (with `$n` placeholders
+        /// for any group of the [`Self::regex`]).
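+        ///
+        /// e.g. a hypothetical `os_replacement` of `"Mac OS X $1.$2"`
+        /// over captures `10` and `15` would resolve to
+        /// `Mac OS X 10.15`.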
+        pub os_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::major`], may be fully templated.
+        pub os_v1_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::minor`], may be fully templated.
+        pub os_v2_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::patch`], may be fully templated.
+        pub os_v3_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::patch_minor`], may be fully templated.
+        pub os_v4_replacement: Option<Cow<'a, str>>,
+    }
+    /// Builder for [`Extractor`].
+    #[derive(Default)]
+    pub struct Builder<'a> {
+        builder: regex_filtered::Builder,
+        repl: Vec<(
+            Resolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+        )>,
+    }
+    impl<'a> Builder<'a> {
+        ///
+        pub fn new() -> Self {
+            Self::default()
+        }
+
+        /// Builds the [`Extractor`], may fail if building the
+        /// prefilter fails.
+        pub fn build(self) -> Result<Extractor<'a>, BuildError> {
+            let Self { builder, repl } = self;
+
+            Ok(Extractor {
+                matcher: builder.build()?,
+                repl,
+            })
+        }
+
+        /// Add a [`Parser`] configuration; fails if the regex can not
+        /// be parsed.
+        pub fn push(mut self, os: Parser<'a>) -> Result<Self, ParseError> {
+            self.builder = self.builder.push(&os.regex)?;
+            let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
+            // number of groups in regex, excluding implicit entire match group
+            let groups = r.captures_len() - 1;
+            self.repl.push((
+                Resolver::new(os.os_replacement, groups, 1),
+                OptResolver::new(os.os_v1_replacement, groups, 2),
+                OptResolver::new(os.os_v2_replacement, groups, 3),
+                OptResolver::new(os.os_v3_replacement, groups, 4),
+                OptResolver::new(os.os_v4_replacement, groups, 5),
+            ));
+            Ok(self)
+        }
+
+        /// Bulk loading of parsers into the builder.
+        pub fn push_all<I>(self, ua: I) -> Result<Self, ParseError>
+        where
+            I: IntoIterator<Item = Parser<'a>>,
+        {
+            ua.into_iter().try_fold(self, |s, p| s.push(p))
+        }
+    }
+
+    /// OS extractor structure
+    pub struct Extractor<'a> {
+        matcher: regex_filtered::Regexes,
+        repl: Vec<(
+            Resolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+        )>,
+    }
+    impl<'a> Extractor<'a> {
+        /// Matches & extracts the OS data for this user agent,
+        /// returns `None` if the UA string could not be matched.
+        pub fn extract(&'a self, ua: &'a str) -> Option<ValueRef<'a>> {
+            let (idx, re) = self.matcher.matching(ua).next()?;
+            let c = re.captures(ua)?;
+
+            let (o, v1, v2, v3, v4) = &self.repl[idx];
+
+            Some(ValueRef {
+                os: o.resolve(&c),
+                major: v1.resolve(&c),
+                minor: v2.resolve(&c),
+                patch: v3.resolve(&c),
+                patch_minor: v4.resolve(&c),
+            })
+        }
+    }
+
+    /// An OS extraction result.
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct ValueRef<'a> {
+        ///
+        pub os: Cow<'a, str>,
+        ///
+        pub major: Option<Cow<'a, str>>,
+        ///
+        pub minor: Option<Cow<'a, str>>,
+        ///
+        pub patch: Option<Cow<'a, str>>,
+        ///
+        pub patch_minor: Option<Cow<'a, str>>,
+    }
+
+    impl ValueRef<'_> {
+        /// Converts a [`ValueRef`] into a [`Value`] to avoid lifetime
+        /// concerns; may need to allocate and copy any data currently
+        /// borrowed from a [`Parser`] or user agent string.
+        pub fn into_owned(self) -> Value {
+            Value {
+                os: self.os.into_owned(),
+                major: self.major.map(|c| c.into_owned()),
+                minor: self.minor.map(|c| c.into_owned()),
+                patch: self.patch.map(|c| c.into_owned()),
+                patch_minor: self.patch_minor.map(|c| c.into_owned()),
+            }
+        }
+    }
+
+    /// Owned version of [`ValueRef`].
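+    ///
+    /// A sketch of obtaining one, assuming `ex` is a built OS
+    /// [`Extractor`] and `ua` a user agent string:
+    ///
+    /// ```ignore
+    /// let owned: Value = ex.extract(ua).unwrap().into_owned();
+    /// ```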
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct Value {
+        ///
+        pub os: String,
+        ///
+        pub major: Option<String>,
+        ///
+        pub minor: Option<String>,
+        ///
+        pub patch: Option<String>,
+        ///
+        pub patch_minor: Option<String>,
+    }
+}
+
+/// Extraction module for the device data of the user agent string.
+pub mod device {
+    use serde::Deserialize;
+    use std::borrow::Cow;
+
+    use regex_filtered::{BuildError, ParseError};
+
+    use crate::resolvers::{OptResolver, Resolver};
+
+    /// regex flags
+    #[derive(Deserialize, PartialEq, Eq)]
+    pub enum Flag {
+        /// Enables case-insensitive regex matching, deserializes from
+        /// the string `"i"`
+        #[serde(rename = "i")]
+        IgnoreCase,
+    }
+    /// Device parser description.
+    #[derive(Deserialize, Default)]
+    pub struct Parser<'a> {
+        /// Regex pattern to use for matching and data extraction.
+        pub regex: Cow<'a, str>,
+        /// Configuration flags for the regex, if any.
+        pub regex_flag: Option<Flag>,
+        /// Device replacement data, fully templated, must be present
+        /// *or* the regex must have at least one group, which will be
+        /// used instead.
+        pub device_replacement: Option<Cow<'a, str>>,
+        /// Brand replacement data, fully templated, optional, if
+        /// missing there is no fallback.
+        pub brand_replacement: Option<Cow<'a, str>>,
+        /// Model replacement data, fully templated, optional, if
+        /// missing will be replaced by the first group if the regex
+        /// has one.
+        pub model_replacement: Option<Cow<'a, str>>,
+    }
+
+    /// Extractor builder.
+    #[derive(Default)]
+    pub struct Builder<'a> {
+        builder: regex_filtered::Builder,
+        repl: Vec<(Resolver<'a>, OptResolver<'a>, OptResolver<'a>)>,
+    }
+    impl<'a> Builder<'a> {
+        /// Creates a builder in the default configuration, which is
+        /// the only configuration.
+        pub fn new() -> Self {
+            Self::default()
+        }
+
+        /// Builds an Extractor, may fail if compiling the prefilter fails.
+        pub fn build(self) -> Result<Extractor<'a>, BuildError> {
+            let Self { builder, repl } = self;
+
+            Ok(Extractor {
+                matcher: builder.build()?,
+                repl,
+            })
+        }
+
+        /// Add a parser to the set; may fail if parsing the regex
+        /// fails.
+        pub fn push(mut self, device: Parser<'a>) -> Result<Self, ParseError> {
+            self.builder = self.builder.push_opt(
+                &device.regex,
+                regex_filtered::Options::new()
+                    .case_insensitive(device.regex_flag == Some(Flag::IgnoreCase)),
+            )?;
+            let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
+            // number of groups in regex, excluding implicit entire match group
+            let groups = r.captures_len() - 1;
+            self.repl.push((
+                Resolver::new(device.device_replacement, groups, 1),
+                // brand has no group fallback: pass an out-of-range
+                // group index so the resolver falls through to `None`
+                OptResolver::new(device.brand_replacement, 0, 999),
+                OptResolver::new(device.model_replacement, groups, 1),
+            ));
+            Ok(self)
+        }
+
+        /// Bulk loading of parsers into the builder.
+        pub fn push_all<I>(self, ua: I) -> Result<Self, ParseError>
+        where
+            I: IntoIterator<Item = Parser<'a>>,
+        {
+            ua.into_iter().try_fold(self, |s, p| s.push(p))
+        }
+    }
+
+    /// Device extractor object.
+    pub struct Extractor<'a> {
+        matcher: regex_filtered::Regexes,
+        repl: Vec<(Resolver<'a>, OptResolver<'a>, OptResolver<'a>)>,
+    }
+    impl<'a> Extractor<'a> {
+        /// Perform data extraction from the user agent string;
+        /// returns `None` if no regex in the [`Extractor`] matches
+        /// the input.
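+        ///
+        /// A sketch, assuming `ex` is a built device [`Extractor`]:
+        ///
+        /// ```ignore
+        /// if let Some(d) = ex.extract(ua) {
+        ///     println!("{} ({:?})", d.device, d.brand);
+        /// }
+        /// ```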
+        pub fn extract(&'a self, ua: &'a str) -> Option<ValueRef<'a>> {
+            let (idx, re) = self.matcher.matching(ua).next()?;
+            let c = re.captures(ua)?;
+
+            let (d, v1, v2) = &self.repl[idx];
+
+            Some(ValueRef {
+                device: d.resolve(&c),
+                brand: v1.resolve(&c),
+                model: v2.resolve(&c),
+            })
+        }
+    }
+
+    /// Extracted device content, may borrow from one of the
+    /// [`Parser`]s or from the user agent string.
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct ValueRef<'a> {
+        ///
+        pub device: Cow<'a, str>,
+        ///
+        pub brand: Option<Cow<'a, str>>,
+        ///
+        pub model: Option<Cow<'a, str>>,
+    }
+
+    impl ValueRef<'_> {
+        /// Converts [`Self`] to an owned [`Value`] getting rid of
+        /// borrowing concerns; may need to allocate and copy if any
+        /// of the attributes actually borrows from a [`Parser`] or
+        /// the user agent string.
+        pub fn into_owned(self) -> Value {
+            Value {
+                device: self.device.into_owned(),
+                brand: self.brand.map(|c| c.into_owned()),
+                model: self.model.map(|c| c.into_owned()),
+            }
+        }
+    }
+
+    /// Owned version of [`ValueRef`].
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct Value {
+        ///
+        pub device: String,
+        ///
+        pub brand: Option<String>,
+        ///
+        pub model: Option<String>,
+    }
+}
diff --git a/ua-parser/src/resolvers.rs b/ua-parser/src/resolvers.rs
new file mode 100644
index 0000000..9f52161
--- /dev/null
+++ b/ua-parser/src/resolvers.rs
@@ -0,0 +1,171 @@
+// TODO: what happens in case of optional groups?
+//
+// Sadly regex offers no way to actually query that nicely: via
+// static_captures_len it only specifies whether all groups are
+// required, if any group is optional that returns `None`.
+
+use crate::Error;
+use regex::Captures;
+use std::borrow::Cow;
+
+fn get<'s>(c: &Captures<'s>, group: usize) -> Option<&'s str> {
+    c.get(group).map(|g| g.as_str()).filter(|s| !s.is_empty())
+}
+
+// TODO:
+// - memchr?
+// - u16 checks against u16 buffer (check all positions)?
+// - swar/simd?
+fn has_substitution(s: &str) -> bool {
+    debug_assert!(!s.is_empty());
+    std::iter::zip(s.as_bytes(), &s.as_bytes()[1..]).any(|(&d, n)| d == b'$' && n.is_ascii_digit())
+}
+
+/// Resolver with full templating: the template string can contain
+/// `$n` markers which get replaced by the corresponding match group.
+///
+/// - if there is a non-null replacement pattern, then it must be used with
+///   match groups as template parameters (at indices 1+)
+///   - the result is stripped
+///   - if it is an empty string, then it's replaced by a null
+/// - otherwise fallback to a (possibly optional) match group
+/// - or null (device brand has no fallback)
+pub(crate) enum Resolver<'a> {
+    Replacement(Cow<'a, str>),
+    Capture(usize),
+    Template(Cow<'a, str>),
+}
+impl<'a> Resolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize, idx: usize) -> Self {
+        if let Some(s) = repl.filter(|s| !s.trim().is_empty()) {
+            if has_substitution(&s) {
+                Self::Template(s)
+            } else {
+                Self::Replacement(s)
+            }
+        } else if groups >= idx {
+            Self::Capture(idx)
+        } else {
+            Self::Replacement("".into())
+        }
+    }
+
+    pub(crate) fn resolve(&'a self, c: &Captures<'a>) -> Cow<'a, str> {
+        match self {
+            Self::Replacement(s) => (**s).into(),
+            Self::Capture(i) => get(c, *i).unwrap_or("").into(),
+            Self::Template(t) => {
+                let mut r = String::new();
+                c.expand(t, &mut r);
+                let trimmed = r.trim();
+                if r.len() == trimmed.len() {
+                    r.into()
+                } else {
+                    trimmed.to_string().into()
+                }
+            }
+        }
+    }
+}
+
+/// Similar to [`Resolver`] but allows a [`None`] aka no resolution.
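+/// e.g. an unset `brand_replacement` with no group fallback resolves
+/// to `None` rather than an empty string.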
+pub(crate) enum OptResolver<'a> {
+    None,
+    Replacement(Cow<'a, str>),
+    Capture(usize),
+    Template(Cow<'a, str>),
+}
+impl<'a> OptResolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize, idx: usize) -> Self {
+        if let Some(s) = repl.filter(|s| !s.trim().is_empty()) {
+            if has_substitution(&s) {
+                Self::Template(s)
+            } else {
+                Self::Replacement(s)
+            }
+        } else if groups >= idx {
+            Self::Capture(idx)
+        } else {
+            Self::None
+        }
+    }
+
+    pub(crate) fn resolve(&'a self, c: &Captures<'a>) -> Option<Cow<'a, str>> {
+        match self {
+            Self::None => None,
+            Self::Replacement(s) => Some((**s).into()),
+            Self::Capture(i) => get(c, *i).map(From::from),
+            Self::Template(t) => {
+                let mut r = String::new();
+                c.expand(t, &mut r);
+                let trimmed = r.trim();
+                if trimmed.is_empty() {
+                    None
+                } else if r.len() == trimmed.len() {
+                    Some(r.into())
+                } else {
+                    Some(trimmed.to_string().into())
+                }
+            }
+        }
+    }
+}
+
+/// Dedicated restrict-templated resolver for UserAgent#family:
+/// supports templating in the replacement, but only for the `$1`
+/// value / group.
+pub(crate) enum FamilyResolver<'a> {
+    Capture,
+    Replacement(Cow<'a, str>),
+    Template(Cow<'a, str>),
+}
+impl<'a> FamilyResolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize) -> Result<Self, Error> {
+        match repl {
+            Some(s) if s.contains("$1") => {
+                if groups < 1 {
+                    Err(Error::MissingGroup(1))
+                } else {
+                    Ok(FamilyResolver::Template(s))
+                }
+            }
+            Some(s) if !s.is_empty() => Ok(FamilyResolver::Replacement(s)),
+            _ if groups >= 1 => Ok(FamilyResolver::Capture),
+            _ => Ok(FamilyResolver::Replacement("".into())),
+        }
+    }
+
+    pub(crate) fn resolve(&'a self, c: &super::Captures<'a>) -> Cow<'a, str> {
+        match self {
+            FamilyResolver::Capture => get(c, 1).unwrap_or("").into(),
+            FamilyResolver::Replacement(s) => (**s).into(),
+            FamilyResolver::Template(t) => t.replace("$1", get(c, 1).unwrap_or("")).into(),
+        }
+    }
+}
+
+/// Untemplated resolver, the replacement value is used as-is if
+/// present.
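+/// e.g. a `v1_replacement` of `"10"` always resolves to `Some("10")`,
+/// while an unset one falls back to the match group at `idx` if the
+/// regex has one, and to `None` otherwise.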
+pub(crate) enum FallbackResolver<'a> {
+    None,
+    Capture(usize),
+    Replacement(Cow<'a, str>),
+}
+impl<'a> FallbackResolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize, idx: usize) -> Self {
+        if let Some(s) = repl.filter(|s| !s.is_empty()) {
+            Self::Replacement(s)
+        } else if groups >= idx {
+            Self::Capture(idx)
+        } else {
+            Self::None
+        }
+    }
+    pub(crate) fn resolve(&'a self, c: &super::Captures<'a>) -> Option<&'a str> {
+        match self {
+            FallbackResolver::None => None,
+            FallbackResolver::Capture(n) => get(c, *n),
+            FallbackResolver::Replacement(r) => Some(r),
+        }
+    }
+}
diff --git a/ua-parser/tests/integration.rs b/ua-parser/tests/integration.rs
new file mode 100644
index 0000000..66429f2
--- /dev/null
+++ b/ua-parser/tests/integration.rs
@@ -0,0 +1,391 @@
+use serde::Deserialize;
+
+fn empty_is_none<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: serde_yaml::Value = serde::de::Deserialize::deserialize(deserializer)?;
+    match s {
+        serde_yaml::Value::Null => Ok(None),
+        serde_yaml::Value::String(s) => {
+            if s.is_empty() {
+                Ok(None)
+            } else {
+                Ok(Some(s))
+            }
+        }
+        v => panic!("unexpected value {v:?}"),
+    }
+}
+
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+struct UserAgent {
+    family: String,
+    #[serde(deserialize_with = "empty_is_none")]
+    major: Option<String>,
+    #[serde(deserialize_with = "empty_is_none")]
+    minor: Option<String>,
+    #[serde(deserialize_with = "empty_is_none")]
+    patch: Option<String>,
+    #[serde(default, deserialize_with = "empty_is_none")]
+    patch_minor: Option<String>,
+}
+impl From<ua_parser::user_agent::ValueRef<'_>> for UserAgent {
+    fn from(value: ua_parser::user_agent::ValueRef<'_>) -> Self {
+        let value = value.into_owned();
+        Self {
+            family: value.family,
+            major: value.major,
+            minor: value.minor,
+            patch: value.patch,
+            patch_minor: value.patch_minor,
+        }
+    }
+}
+
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+pub struct OS {
+    pub family: String,
+    pub major: Option<String>,
+    pub minor: Option<String>,
+    pub patch: Option<String>,
+    pub patch_minor: Option<String>,
+}
+impl From<ua_parser::os::ValueRef<'_>> for OS {
+    fn from(value: ua_parser::os::ValueRef<'_>) -> Self {
+        let value = value.into_owned();
+        Self {
+            family: value.os,
+            major: value.major,
+            minor: value.minor,
+            patch: value.patch,
+            patch_minor: value.patch_minor,
+        }
+    }
+}
+
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+pub struct Device {
+    pub family: String,
+    pub brand: Option<String>,
+    pub model: Option<String>,
+}
+impl From<ua_parser::device::ValueRef<'_>> for Device {
+    fn from(value: ua_parser::device::ValueRef<'_>) -> Self {
+        let value = value.into_owned();
+        Self {
+            family: value.device,
+            brand: value.brand,
+            model: value.model,
+        }
+    }
+}
+
+fn get_extractor() -> Result<
+    &'static ua_parser::Extractor<'static>,
+    &'static (dyn std::error::Error + Send + Sync + 'static),
+> {
+    static EXTRACTOR: std::sync::OnceLock<
+        Result<ua_parser::Extractor<'static>, Box<dyn std::error::Error + Send + Sync>>,
+    > = std::sync::OnceLock::new();
+
+    EXTRACTOR
+        .get_or_init(|| {
+            let p: std::path::PathBuf = [env!("CARGO_MANIFEST_DIR"), "uap-core", "regexes.yaml"]
+                .iter()
+                .collect();
+            let rs = serde_yaml::from_reader::<_, ua_parser::Regexes>(std::fs::File::open(p)?)?
+                .try_into()?;
+            Ok(rs)
+        })
+        .as_ref()
+        .map_err(|e| &**e)
+}
+
+#[derive(Deserialize)]
+struct UaTestCases {
+    test_cases: Vec<UaTestCase>,
+}
+#[derive(Deserialize)]
+struct UaTestCase {
+    user_agent_string: String,
+    #[serde(flatten)]
+    ua: UserAgent,
+}
+
+#[test]
+fn test_ua() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "tests",
+        "test_ua.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_ff() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "firefox_user_agent_strings.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_pgts() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "pgts_browser_list.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_opera() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "opera_mini_user_agent_strings.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_podcasting() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "podcasting_user_agent_strings.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[derive(Deserialize)]
+struct DevTestCases {
+    test_cases: Vec<DevTestCase>,
+}
+#[derive(Deserialize)]
+struct DevTestCase {
+    user_agent_string: String,
+    #[serde(flatten)]
+    dev: Device,
+}
+
+#[test]
+fn test_device() {
+    let rs = &get_extractor().unwrap().dev;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "tests",
+        "test_device.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items =
+        serde_yaml::from_reader::<_, DevTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for DevTestCase {
+        user_agent_string,
+        dev,
+    } in items.test_cases
+    {
+        let dev_ = rs.extract(&user_agent_string).map_or_else(
+            || Device {
+                family: "Other".to_string(),
+                brand: None,
+                model: None,
+            },
+            From::from,
+        );
+        assert_eq!(dev, dev_, "{user_agent_string}");
+    }
+}
+
+#[derive(Deserialize)]
+struct OSTestCases {
+    test_cases: Vec<OSTestCase>,
+}
+#[derive(Deserialize)]
+struct OSTestCase {
+    user_agent_string: String,
+    #[serde(flatten)]
+    os: OS,
+}
+
+#[test]
+fn test_os() {
+    let rs = &get_extractor().unwrap().os;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "tests",
+        "test_os.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, OSTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for OSTestCase {
+        user_agent_string,
+        os,
+    } in items.test_cases
+    {
+        let os_ = rs.extract(&user_agent_string).map_or_else(
+            || OS {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(os, os_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_additional_os() {
+    let rs = &get_extractor().unwrap().os;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "additional_os_tests.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, OSTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for OSTestCase {
+        user_agent_string,
+        os,
+    } in items.test_cases
+    {
+        let os_ = rs.extract(&user_agent_string).map_or_else(
+            || OS {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(os, os_, "{user_agent_string}");
+    }
+}
diff --git a/ua-parser/uap-core b/ua-parser/uap-core
new file mode 160000
index 0000000..ae4c16d
--- /dev/null
+++ b/ua-parser/uap-core
@@ -0,0 +1 @@
+Subproject commit ae4c16ddd81a01c66f396953016c9e06f695d78f