From 0daceeb0757a5058a9abb005bae459c7a04bdd73 Mon Sep 17 00:00:00 2001
From: masklinn
Date: Sun, 2 Jun 2024 20:23:50 +0200
Subject: [PATCH] Initial implementation

Notes:

- uap is licensed under Apache 2.0 because that's the normal license
  for the project
- regex-filtered is licensed under BSD 3-clauses because it's largely
  a translation (with changes) of re2's FilteredRE2 and IANAL but it
  seems fairer (and safer) to match
---
 .github/workflows/rust.yml      |  28 ++
 .gitignore                      |   2 +
 .gitmodules                     |   3 +
 Cargo.toml                      |   3 +
 README.md                       | 141 ++++++++
 regex-filtered/Cargo.toml       |  23 ++
 regex-filtered/LICENSE          |  28 ++
 regex-filtered/README.md        |  97 +++++
 regex-filtered/benches/regex.rs |  34 ++
 regex-filtered/src/lib.rs       | 442 +++++++++++++++++++++++
 regex-filtered/src/mapper.rs    | 429 ++++++++++++++++++++++
 regex-filtered/src/model.rs     | 516 +++++++++++++++++++++++++++
 ua-parser/Cargo.toml            |  19 +
 ua-parser/LICENSE               | 201 +++++++++++
 ua-parser/examples/bench.rs     |  46 +++
 ua-parser/src/lib.rs            | 609 ++++++++++++++++++++++++++++++++
 ua-parser/src/resolvers.rs      | 171 +++++++++
 ua-parser/tests/integration.rs  | 391 ++++++++++++++++++++
 ua-parser/uap-core              |   1 +
 19 files changed, 3184 insertions(+)
 create mode 100644 .github/workflows/rust.yml
 create mode 100644 .gitignore
 create mode 100644 .gitmodules
 create mode 100644 Cargo.toml
 create mode 100644 README.md
 create mode 100644 regex-filtered/Cargo.toml
 create mode 100644 regex-filtered/LICENSE
 create mode 100644 regex-filtered/README.md
 create mode 100644 regex-filtered/benches/regex.rs
 create mode 100644 regex-filtered/src/lib.rs
 create mode 100644 regex-filtered/src/mapper.rs
 create mode 100644 regex-filtered/src/model.rs
 create mode 100644 ua-parser/Cargo.toml
 create mode 100644 ua-parser/LICENSE
 create mode 100644 ua-parser/examples/bench.rs
 create mode 100644 ua-parser/src/lib.rs
 create mode 100644 ua-parser/src/resolvers.rs
 create mode 100644 ua-parser/tests/integration.rs
 create mode 160000 ua-parser/uap-core

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
new file mode 100644
index 0000000..537b525
--- /dev/null
+++ b/.github/workflows/rust.yml
@@ -0,0 +1,28 @@
name: Rust

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

env:
  CARGO_TERM_COLOR: always

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
      with:
        submodules: true
    - name: Build
      run: cargo build --verbose
    - name: Format
      run: cargo fmt --check
    - name: Clippy
      run: cargo clippy
    - name: Run tests
      run: cargo test -r --verbose
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
/target
Cargo.lock
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..542987b
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
[submodule "ua-parser/uap-core"]
	path = ua-parser/uap-core
	url = https://github.com/ua-parser/uap-core
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..63463c3
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,3 @@
[workspace]
members = ["regex-filtered", "ua-parser"]
resolver = "2"
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..293a83e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,141 @@
# User Agent Parser

This module implements the [browserscope / uap
standard](https://github.com/ua-parser/uap-core) for Rust, allowing
the extraction of various metadata from user agents.
The browserscope standard is data-oriented, with [`regexes.yaml`]
specifying the matching and extraction from user-agent strings. This
library implements the matching protocols and provides various types
to make loading the dataset easier, however it does *not* provide the
data itself, to avoid pulling in serialization dependencies and to
avoid constraining how the data is loaded.

## Dataset loading

The crate does not provide any sort of precompiled data file, or
dedicated loader, however [`Regexes`] implements
[`serde::Deserialize`] and can load a [`regexes.yaml`] file or any
format-preserving conversion thereof (e.g. loading from JSON or CBOR
might be preferred if the application already depends on one of
those):

```no_run
# let ua_str = "";
let f = std::fs::File::open("regexes.yaml")?;
let regexes: ua_parser::Regexes = serde_yaml::from_reader(f)?;
let extractor = ua_parser::Extractor::try_from(regexes)?;

# Ok::<(), Box<dyn std::error::Error>>(())
```

All the data-description structures are also Plain Old Data, so they
can be embedded in the application directly, e.g. via a build script:

``` rust
let parsers = vec![
    ua_parser::user_agent::Parser {
        regex: "foo".into(),
        family_replacement: Some("bar".into()),
        ..Default::default()
    }
];
```
## Extraction

The crate provides the ability to either extract individual
information sets (user agent a.k.a. browser, OS, and device) or
extract all three in a single call.

The three infosets are independent and non-overlapping, so while the
full extractor may be convenient, a complete extraction is
unnecessary overhead if only one infoset is needed, and the
extractors themselves are somewhat costly to create and take up
memory.

### Complete Extractor

The complete extractor is simply converted from the [`Regexes`]
structure. The resulting [`Extractor`] embeds all three module-level
extractors as attributes, and [`Extractor::extract`] returns a
3-tuple of `ValueRef`s.


### Individual Extractors

The individual extractors are in the [`user_agent`], [`os`], and
[`device`] modules, all three of which follow the exact same model:

- a `Parser` struct which specifies individual parser configurations,
  used as inputs to the `Builder`
- a `Builder`, into which the relevant parsers can be `push`-ed
- an `Extractor` created from the `Builder`, from which the user can
  `extract` a `ValueRef`
- the `ValueRef` result of data extraction, which may borrow from (and
  is thus lifetime-bound to) the `Parser` substitution data and the
  user agent string it was extracted from
- for convenience, an owned `Value` variant of the `ValueRef`

``` rust
use ua_parser::os::{Builder, Parser, ValueRef};

let e = Builder::new()
    .push(Parser {
        regex: r"(Android)[ \-/](\d+)(?:\.(\d+)|)(?:[.\-]([a-z0-9]+)|)".into(),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Donut".into(),
        os_v1_replacement: Some("1".into()),
        os_v2_replacement: Some("2".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Eclair".into(),
        os_v1_replacement: Some("2".into()),
        os_v2_replacement: Some("1".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Froyo".into(),
        os_v1_replacement: Some("2".into()),
        os_v2_replacement: Some("2".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Gingerbread".into(),
        os_v1_replacement: Some("2".into()),
        os_v2_replacement: Some("3".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) Honeycomb".into(),
        os_v1_replacement: Some("3".into()),
        ..Default::default()
    })?
    .push(Parser {
        regex: r"(Android) (\d+);".into(),
        ..Default::default()
    })?
    .build()?;

assert_eq!(
    e.extract("Android Donut"),
    Some(ValueRef {
        os: "Android".into(),
        major: Some("1".into()),
        minor: Some("2".into()),
        ..Default::default()
    }),
);
assert_eq!(
    e.extract("Android 15"),
    Some(ValueRef { os: "Android".into(), major: Some("15".into()), ..Default::default()}),
);
assert_eq!(
    e.extract("ZuneWP7"),
    None,
);
# Ok::<(), Box<dyn std::error::Error>>(())
```

[`regexes.yaml`]: https://github.com/ua-parser/uap-core/blob/master/regexes.yaml
diff --git a/regex-filtered/Cargo.toml b/regex-filtered/Cargo.toml
new file mode 100644
index 0000000..a0f9490
--- /dev/null
+++ b/regex-filtered/Cargo.toml
@@ -0,0 +1,23 @@
[package]
name = "regex-filtered"
version = "0.1.0"
edition = "2021"
description = "Efficiently check an input against a large number of patterns"
keywords = ["regex", "filter", "FilteredRE2", "multiple", "prefilter"]
license = "BSD-3-Clause"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
aho-corasick = "1.1.3"
indexmap = "2.2.6"
itertools = "0.13.0"
regex = "1.10.4"
regex-syntax = "0.8.3"

[dev-dependencies]
criterion = "0.5.1"

[[bench]]
name = "regex"
harness = false
diff --git a/regex-filtered/LICENSE b/regex-filtered/LICENSE
new file mode 100644
index 0000000..9b346f1
--- /dev/null
+++ b/regex-filtered/LICENSE
@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright (c) 2024, ua-parser project

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/regex-filtered/README.md b/regex-filtered/README.md
new file mode 100644
index 0000000..1667920
--- /dev/null
+++ b/regex-filtered/README.md
@@ -0,0 +1,97 @@
# regex-filtered: FilteredRE2 for rust-regex

This crate implements the logic behind [`FilteredRE2`] on top of
[`regex`].
The purpose is to allow efficient selection of one or more regexes
matching an input from a *large* set, without having to check every
regex linearly: candidate regexes are prefiltered, and only those are
matched against the input.

This should be preferred to [`regex::RegexSet`] if the regexes are
non-trivial (e.g. non-literal), as [`regex::RegexSet`] constructs a
single state machine which quickly grows huge and slow.

Linear matching does not have *that* issue and works fine with
complex regexes, but it doesn't scale as the number of regexes
increases, and match failures quickly get very expensive (as they
require traversing the entire set every time).

## Usage

``` rust
let matcher = regex_filtered::Builder::new()
    .push("foo")?
    .push("bar")?
    .push("baz")?
    .push("quux")?
    .build()?;

assert!(matcher.is_match("bar"));
assert_eq!(matcher.matching("baz").count(), 1);
assert_eq!(matcher.matching("foo quux").count(), 2);
# Ok::<(), Box<dyn std::error::Error>>(())
```

[`Regexes::is_match`] returns whether *any* pattern in the set matches
the haystack. It is essentially equivalent to
`matcher.matching(...).next().is_some()`.

[`Regexes::matching`] returns an iterator of matching [`regex::Regex`]es
and their corresponding indices. The index can be used to look up
ancillary data (e.g. replacement content), and the [`regex::Regex`]
can be used to [`regex::Regex::find`] or [`regex::Regex::captures`]
data out of the haystack.

## Notes

`regex-filtered` only returns the matching regexes (and their
indices), as capturing especially is *significantly* more expensive
than checking for a match. This slightly pessimises situations where
the prefilter prunes perfectly, but it is a large gain as soon as
that's not the case and the prefilter has to be post-filtered.

## Concepts

From a large set of regexes, extract distinguishing literal tokens,
match the tokens against the input, reverse-look-up which regexes the
matching tokens correspond to, and only run the corresponding regexes
on the input.

This extraction is done by gathering literal items, converting them
to content sets, then symbolically executing concatenations and
alternations (`|`) in order to find out which literal items *need* to
be present in the haystack for a regex to match. A reverse index is
then built from literal items to regexes.

At match time, a prefilter checks which literals are present in the
haystack, the reverse index maps those literals back to candidate
regexes, and only the candidates are matched against the haystack so
that only actually-matching regexes are returned.

## Divergences

While [`FilteredRE2`] requires the user to perform prefiltering,
`regex-filtered` handles this internally: [`aho-corasick`] is pretty
much ideal for that task and already a dependency of [`regex`], which
`regex-filtered` is based on.
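## Example: index lookups and captures

A small sketch tying the above together: the `replacements` table
here is a made-up stand-in for whatever ancillary data an application
stores alongside its patterns, with the indices yielded by
[`Regexes::matching`] keying into that parallel storage while the
returned regexes handle the (more expensive) capture pass.

``` rust
let matcher = regex_filtered::Builder::new()
    .push_all([r"(foo|bar)baz/(\d+)", r"quux/(\d+)"])?
    .build()?;

// hypothetical ancillary data, parallel to the patterns above
let replacements = ["foobar $2", "quux $1"];

for (idx, re) in matcher.matching("barbaz/42") {
    // `idx` keys into whatever data was stored alongside the patterns...
    let _replacement = replacements[idx];
    // ...and `re` can be used to extract captures from the haystack
    assert_eq!(&re.captures("barbaz/42").unwrap()[2], "42");
}
# Ok::<(), Box<dyn std::error::Error>>(())
```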
## TODO

- add a stats feature to report various build-size infos e.g.

  - number of tokens
  - number of regexes
  - number of unfiltered regexes; useful to know whether prefiltering
    will actually happen or a naive sequential application would be a
    better idea
  - ratio of checked regexes to successes (how does it work with lazy
    iterators?)
  - total / prefiltered (- unfiltered) counts, so the impact of the
    atom size can be evaluated
  - also maybe mapper stats on the pruning stuff and whatever

[`aho-corasick`]: https://docs.rs/aho-corasick/
[`FilteredRE2`]: https://github.com/google/re2/blob/main/re2/filtered_re2.h
[`regex`]: https://docs.rs/regex/
[`regex-syntax`]: https://docs.rs/regex-syntax/
diff --git a/regex-filtered/benches/regex.rs b/regex-filtered/benches/regex.rs
new file mode 100644
index 0000000..62d6f94
--- /dev/null
+++ b/regex-filtered/benches/regex.rs
@@ -0,0 +1,34 @@
use criterion::{criterion_group, criterion_main, Criterion};

use regex::Regex;

/// On this trivial synthetic test, the results on an M1P are:
///
/// * 18ns for a match failure
/// * 33ns for a match success
/// * 44ns for a capture failure
/// * 111ns for a capture success
///
/// The cutoff is at an average of n = 1.27 match failures, so it
/// really depends on how selective the prefilter is...
fn bench_regex(c: &mut Criterion) {
    let r = Regex::new(r"(foo|bar)baz/(\d+)\.(\d+)").unwrap();

    c.bench_function("has match - success", |b| {
        b.iter(|| r.is_match("foobaz/1.2"))
    });
    c.bench_function("has match - failure", |b| {
        b.iter(|| r.is_match("fooxbaz/1.2"))
    });

    c.bench_function("match - success", |b| b.iter(|| r.find("foobaz/1.2")));
    c.bench_function("match - failure", |b| b.iter(|| r.find("fooxbaz/1.2")));

    c.bench_function("capture - success", |b| b.iter(|| r.captures("foobaz/1.2")));
    c.bench_function("capture - failure", |b| {
        b.iter(|| r.captures("fooxbaz/1.2"))
    });
}

criterion_group!(benches, bench_regex);
criterion_main!(benches);
diff --git a/regex-filtered/src/lib.rs b/regex-filtered/src/lib.rs
new file mode 100644
index 0000000..9b54604
--- /dev/null
+++ b/regex-filtered/src/lib.rs
@@ -0,0 +1,442 @@
#![doc = include_str!("../README.md")]
#![deny(unsafe_code)]
#![warn(missing_docs)]

use aho_corasick::AhoCorasick;

mod mapper;
mod model;
pub use model::Error as ModelError;

/// Builder for the regexes set
pub struct Builder {
    regexes: Vec<regex::Regex>,
    mapper_builder: mapper::Builder,
}

/// Parser configuration, can be used to tune the regex parsing when
/// adding it to the [`Builder`]. Every option defaults to `false`
/// whether through [`Default`] or [`Options::new`].
///
/// The parser can also be configured via standard [`regex`] inline
/// flags.
#[derive(Default)]
pub struct Options {
    case_insensitive: bool,
    dot_matches_new_line: bool,
    ignore_whitespace: bool,
    multi_line: bool,
    crlf: bool,
}

impl Options {
    /// Create a new options object.
    pub fn new() -> Self {
        Self::default()
    }
    /// Configures case-insensitive matching for the entire pattern.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
        self.case_insensitive = yes;
        self
    }
    /// Configures `.` to match newline characters, by default `.`
    /// matches everything *except* newline characters.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
        self.dot_matches_new_line = yes;
        self
    }
    /// Configures ignoring whitespace inside patterns, as well as `#`
    /// line comments ("verbose" mode).
    ///
    /// Verbose mode is useful to break up complex regexes and improve
    /// their documentation.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
        self.ignore_whitespace = yes;
        self
    }
    /// Configures multi-line mode.
    /// When enabled, `^` matches at every start of line and `$` at
    /// every end of line, by default they match only the start and
    /// end of the string respectively.
    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
        self.multi_line = yes;
        self
    }
    /// Allows `\r` as a line terminator, by default only `\n` is a
    /// line terminator (relevant for [`Self::ignore_whitespace`] and
    /// [`Self::multi_line`]).
    pub fn crlf(&mut self, yes: bool) -> &mut Self {
        self.crlf = yes;
        self
    }
    fn to_regex(&self, pattern: &str) -> Result<regex::Regex, regex::Error> {
        regex::RegexBuilder::new(pattern)
            .case_insensitive(self.case_insensitive)
            .dot_matches_new_line(self.dot_matches_new_line)
            .ignore_whitespace(self.ignore_whitespace)
            .multi_line(self.multi_line)
            .crlf(self.crlf)
            .build()
    }
}
impl From<Options> for regex_syntax::Parser {
    fn from(opt: Options) -> Self {
        Self::from(&opt)
    }
}
impl From<&Options> for regex_syntax::Parser {
    fn from(
        Options {
            case_insensitive,
            dot_matches_new_line,
            ignore_whitespace,
            multi_line,
            crlf,
        }: &Options,
    ) -> Self {
        regex_syntax::ParserBuilder::new()
            .case_insensitive(*case_insensitive)
            .dot_matches_new_line(*dot_matches_new_line)
            .ignore_whitespace(*ignore_whitespace)
            .multi_line(*multi_line)
            .crlf(*crlf)
            .build()
    }
}

/// Parsing error when adding a new regex to the [`Builder`].
#[derive(Debug)]
pub enum ParseError {
    /// An error occurred while parsing the regex or translating it to
    /// HIR.
    SyntaxError(String),
    /// An error occurred while processing the regex for atom
    /// extraction.
    ProcessingError(ModelError),
    /// The regex was too large to compile to the NFA (within the
    /// default limits).
    RegexTooLarge(usize),
}
impl std::error::Error for ParseError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            ParseError::ProcessingError(e) => Some(e),
            ParseError::SyntaxError(_) => None,
            ParseError::RegexTooLarge(_) => None,
        }
    }
}
impl std::fmt::Display for ParseError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
}
impl From<regex_syntax::Error> for ParseError {
    fn from(value: regex_syntax::Error) -> Self {
        Self::SyntaxError(value.to_string())
    }
}
impl From<regex::Error> for ParseError {
    fn from(value: regex::Error) -> Self {
        match value {
            regex::Error::CompiledTooBig(v) => Self::RegexTooLarge(v),
            e => Self::SyntaxError(e.to_string()),
        }
    }
}
impl From<ModelError> for ParseError {
    fn from(value: ModelError) -> Self {
        Self::ProcessingError(value)
    }
}

/// Error while compiling the builder to a prefiltered set.
#[derive(Debug)]
pub enum BuildError {
    /// Error while building the prefilter.
    PrefilterError(aho_corasick::BuildError),
}
impl std::error::Error for BuildError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            BuildError::PrefilterError(p) => Some(p),
        }
    }
}
impl std::fmt::Display for BuildError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
}
impl From<aho_corasick::BuildError> for BuildError {
    fn from(value: aho_corasick::BuildError) -> Self {
        Self::PrefilterError(value)
    }
}

impl Builder {
    /// Instantiate a builder with the default metadata configuration:
    ///
    /// - minimum atom length 3
    #[must_use]
    pub fn new() -> Self {
        Self::new_atom_len(3)
    }

    /// Instantiate a builder with a custom minimum atom length.
    /// Increasing the atom length decreases the size and cost of the
    /// prefilter, but may make more regexes impossible to prefilter,
    /// which can increase matching costs.
    #[must_use]
    pub fn new_atom_len(min_atom_len: usize) -> Self {
        Self {
            regexes: Vec::new(),
            mapper_builder: mapper::Builder::new(min_atom_len),
        }
    }

    /// Currently loaded regexes.
    pub fn regexes(&self) -> &[regex::Regex] {
        &self.regexes
    }

    /// Push a single regex into the builder, using the default
    /// parsing options.
    pub fn push(self, s: &str) -> Result<Self, ParseError> {
        self.push_opt(s, &Options::new())
    }

    /// Push a single regex into the builder, using custom parsing
    /// options.
    pub fn push_opt(mut self, regex: &str, opts: &Options) -> Result<Self, ParseError> {
        let hir = regex_syntax::Parser::from(opts).parse(regex)?;
        let pf = model::Model::new(&hir)?;
        self.mapper_builder.push(pf);
        self.regexes.push(opts.to_regex(regex)?);
        Ok(self)
    }

    /// Push a batch of regexes into the builder, using the default
    /// parsing options.
    pub fn push_all<T, I>(self, i: I) -> Result<Self, ParseError>
    where
        T: AsRef<str>,
        I: IntoIterator<Item = T>,
    {
        i.into_iter().try_fold(self, |b, s| b.push(s.as_ref()))
    }

    /// Build the regexes set from the current builder.
    ///
    /// Building a regexes set from no regexes is useless but not an
    /// error.
    pub fn build(self) -> Result<Regexes, BuildError> {
        let Self {
            regexes,
            mapper_builder,
        } = self;
        let (mapper, atoms) = mapper_builder.build();

        // Instead of returning a bunch of atoms for the user to
        // manage, since `regex` depends on aho-corasick by default we
        // can use that directly and not bother the user.
        let prefilter = AhoCorasick::builder()
            .ascii_case_insensitive(true)
            .prefilter(true)
            .build(atoms)?;

        Ok(Regexes {
            regexes,
            mapper,
            prefilter,
        })
    }
}

impl Default for Builder {
    fn default() -> Self {
        Self::new()
    }
}

/// Regexes set, allows testing inputs against a *large* number of
/// *non-trivial* regexes.
pub struct Regexes {
    regexes: Vec<regex::Regex>,
    mapper: mapper::Mapper,
    prefilter: AhoCorasick,
}

impl Regexes {
    // TODO:
    // - number of tokens (prefilter.patterns_len())
    // - number of regexes
    // - number of unfiltered regexes (from mapper)
    // - ratio of checked regexes to successes (cfg-gated)
    // - total / prefiltered (- unfiltered?) so atom size can be manipulated
    #[inline]
    fn prefilter<'a>(&'a self, haystack: &'a str) -> impl Iterator<Item = usize> + 'a {
        self.prefilter
            .find_overlapping_iter(haystack)
            .map(|m| m.pattern().as_usize())
    }

    #[inline]
    fn prefiltered(&self, haystack: &str) -> impl Iterator<Item = usize> {
        self.mapper.atom_to_re(self.prefilter(haystack)).into_iter()
    }

    /// Returns *whether* any regex in the set matches the haystack.
    pub fn is_match(&self, haystack: &str) -> bool {
        self.prefiltered(haystack)
            .any(|idx| self.regexes[idx].is_match(haystack))
    }

    /// Yields the regexes matching the haystack along with their
    /// index.
    ///
    /// The results are guaranteed to be returned in ascending order.
    pub fn matching<'a>(
        &'a self,
        haystack: &'a str,
    ) -> impl Iterator<Item = (usize, &'a regex::Regex)> + 'a {
        self.prefiltered(haystack).filter_map(move |idx| {
            let r = &self.regexes[idx];
            r.is_match(haystack).then_some((idx, r))
        })
    }

    /// Returns a reference to all the regexes in the set.
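    ///
    /// The indices yielded by [`Regexes::matching`] are positions in
    /// this slice.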
    pub fn regexes(&self) -> &[regex::Regex] {
        &self.regexes
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use itertools::Itertools;

    #[test]
    fn empty_filter() {
        let f = Builder::new().build().unwrap();
        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);

        assert_eq!(f.matching("foo").count(), 0);
    }

    #[test]
    fn empty_pattern() {
        let f = Builder::new().push("").unwrap().build().unwrap();

        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);

        assert_eq!(
            f.matching("0123").map(|(idx, _)| idx).collect_vec(),
            vec![0]
        );
    }

    #[test]
    fn small_or_test() {
        let f = Builder::new_atom_len(4)
            .push("(foo|bar)")
            .unwrap()
            .build()
            .unwrap();

        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![]);

        assert_eq!(
            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );

        let f = Builder::new().push("(foo|bar)").unwrap().build().unwrap();

        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![1]);

        assert_eq!(
            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );
    }

    #[test]
    fn basic_matches() {
        let f = Builder::new()
            .push("(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+")
            .unwrap()
            .push("abcd..yyy..yyyzzz")
            .unwrap()
            .push("mnmnpp[a-z]+PPP")
            .unwrap()
            .build()
            .unwrap();

        assert_eq!(
            f.matching("abc121212xyz").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );

        assert_eq!(
            f.matching("abc12312yyyzzz")
                .map(|(idx, _)| idx)
                .collect_vec(),
            vec![0],
        );

        assert_eq!(
            f.matching("abcd12yyy32yyyzzz")
                .map(|(idx, _)| idx)
                .collect_vec(),
            vec![0, 1],
        );
    }

    #[test]
    fn basics() {
        // In re2 this is the `MoveSemantics` test, which is... so not
        // necessary for us. But it's a pair of extra regexes we can
        // test

        let f = Builder::new().push("foo\\d+").unwrap().build().unwrap();

        assert_eq!(
            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );
        assert_eq!(
            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![],
        );

        let f = Builder::new().push("bar\\d+").unwrap().build().unwrap();

        assert_eq!(
            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![],
        );
        assert_eq!(
            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
            vec![0],
        );
    }

    #[test]
    fn bulk_api() {
        use std::io::BufRead as _;

        Builder::new().push_all(["a", "b"]).unwrap();

        Builder::new()
            .push_all(vec!["a".to_string(), "b".to_string()])
            .unwrap();

        Builder::new().push_all("a\nb\nc\nd\n".lines()).unwrap();

        Builder::new()
            .push_all(b"a\nb\nc\nd\n".lines().map(|l| l.unwrap()))
            .unwrap();
    }
}
diff --git a/regex-filtered/src/mapper.rs b/regex-filtered/src/mapper.rs
new file mode 100644
index 0000000..a582b9c
--- /dev/null
+++ b/regex-filtered/src/mapper.rs
@@ -0,0 +1,429 @@
use std::collections::{HashMap, HashSet};
use std::fmt::Display;
use std::fmt::Formatter;

use indexmap::IndexSet;

use super::model::Model;

pub struct Builder {
    min_atom_len: usize,
    models: Vec<Model>,
    unfiltered: Vec<usize>,
}
impl Builder {
    pub fn new(min_atom_len: usize) -> Self {
        Self {
            min_atom_len,
            models: Vec::new(),
            unfiltered: Vec::new(),
        }
    }

    pub fn push(&mut self, mut pf: Model) {
        if !self.keep_node(&mut pf) {
            self.unfiltered.push(self.models.len());
            // these go into unfiltered: regexes which always pass
            // through the filter
            // re2 uses nulls here but that's not us
            pf = Model::all();
        }
        self.models.push(pf);
    }
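    /// Decides whether a (pruned) model node is worth keeping for
    /// prefiltering: atoms shorter than `min_atom_len` are dropped,
    /// an AND node keeps whichever children survive, and an OR node
    /// only survives if *all* of its branches do (a dropped branch
    /// would otherwise let matches through unfiltered).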
    fn keep_node(&self, pf: &mut Model) -> bool {
        match pf {
            Model::All(_) | Model::None(_) => false,
            Model::Atom(_, s) => s.len() >= self.min_atom_len,
            Model::And(_, subs) => {
                subs.retain_mut(|p| self.keep_node(p));
                !subs.is_empty()
            }
            Model::Or(_, subs) => subs.iter_mut().all(|p| self.keep_node(p)),
        }
    }

    pub fn build(self) -> (Mapper, Vec<String>) {
        // inlined `assign_unique_ids` because it doesn't seem super useful... to us
        #[allow(clippy::mutable_key_type)]
        let mut nodes = NodeSet::new();
        let mut atoms = Vec::new();
        let mut atom_index_to_id = Vec::new();
        // Build a vector of all filter nodes, sorted topologically
        // from top to bottom: start `v` off with the top-level node
        // of each regexp model
        let mut v = self.models.iter().collect::<Vec<_>>();

        // now add all the descendant nodes; this has to be a `while`
        // loop because `v` grows as we iterate
        let mut i = 0;
        while i < v.len() {
            let p = v[i];
            i += 1;

            if let Model::And(_, s) | Model::Or(_, s) = p {
                v.extend(s.iter());
            }
        }

        let mut unique_id = 0..;
        // identify unique nodes
        for &node in v.iter().rev() {
            if let Some(canonical) = nodes.get(node) {
                node.set_unique_id(canonical.unique_id());
            } else {
                let uid = unique_id.next().expect("infinite");
                node.set_unique_id(uid);
                if let Model::Atom(_, s) = &node {
                    atoms.push(s.to_string());
                    atom_index_to_id.push(uid);
                }
                nodes.insert(node);
            }
        }

        // maybe this could just be a prealloc and we append since id
        // should be a sequence?
        let mut entries = vec![Entry::default(); unique_id.next().expect("infinite(ish) sequence")];
        // Fill the entries
        for model in v.iter().rev() {
            if nodes.get(model) != Some(model) {
                continue;
            }
            let id = model.unique_id();
            match &model {
                Model::None(_) => unreachable!("no idea why this is an error"),
                // We replace excluded models by All rather than null,
                // so those are not unreachable.
                Model::All(_) => (),
                Model::Atom(_, _) => {
                    entries[id].propagate_up_at_count = 1;
                }
                // For each child, we append our id to the child's
                // list of parent ids... unless we happen to have done
                // so already. The number of appends is the number of
                // unique children, which allows correct upward
                // propagation from AND nodes.
                Model::And(_, s) | Model::Or(_, s) => {
                    let mut up_count = 0;
                    for child_id in s.iter().map(|c| c.unique_id()) {
                        let parents = &mut entries[child_id].parents;
                        if parents.last() != Some(&id) {
                            parents.push(id);
                            up_count += 1;
                        }
                    }

                    entries[id].propagate_up_at_count = if matches!(&model, Model::And(..)) {
                        up_count
                    } else {
                        1
                    };
                }
            }
        }

        // For top level nodes, populate regexp id
        for (i, tl) in v[..self.models.len()].iter().enumerate() {
            if let Some(p) = nodes.get(tl) {
                entries[p.unique_id()].regexps.push(i);
            }
        }

        // Lastly, using probability-based heuristics, we identify nodes
        // that trigger too many parents and then we try to prune edges.
        // We use logarithms below to avoid the likelihood of underflow.
        let log_num_regexps = ((self.models.len() - self.unfiltered.len()) as f64).ln();
        // Hoisted this above the loop so that we don't thrash the heap.
        let mut entries_by_num_edges = Vec::<(usize, usize)>::new();
        for model in v.iter().rev() {
            let Model::And(_, s) = &model else {
                continue;
            };
            if nodes.get(model) != Some(model) {
                continue;
            }
            let id = model.unique_id();

            // Sort the current node's children by their number of parents.
            for child_id in s.iter().map(|c| c.unique_id()) {
                entries_by_num_edges.push((entries[child_id].parents.len(), child_id));
            }
            entries_by_num_edges.sort_unstable();

            // A running estimate of how many regexps will be
            // triggered by pruning the remaining children's edges to
            // the current node. Our nominal target is one, so the
            // threshold is log(1) == 0; pruning occurs iff the child
            // has more than nine edges left.
            let mut log_num_triggered = log_num_regexps;
            for (_, child_id) in entries_by_num_edges.drain(..) {
                let parents = &mut entries[child_id].parents;
                if log_num_triggered > 0. {
                    log_num_triggered += (parents.len() as f64).ln();
                    log_num_triggered -= log_num_regexps;
                } else if parents.len() > 9 {
                    if let Some(idx) = parents.iter().position(|&p| p == id) {
                        parents.swap_remove(idx);
                        // re2 uses an `int`, which can go negative; we
                        // use a `usize` (since the count is derived
                        // from the number of children, though that's
                        // probably unnecessary), which means we can't
                        // keep decrementing below 0
                        entries[id].propagate_up_at_count =
                            entries[id].propagate_up_at_count.saturating_sub(1);
                    }
                }
            }
        }

        (
            Mapper {
                entries,
                unfiltered: self.unfiltered,
                atom_to_entry: atom_index_to_id,
                regexp_count: self.models.len(),
            },
            atoms,
        )
    }
}

impl Display for Mapper {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "#Unique Atoms: {}", self.atom_to_entry.len())?;
        for (i, e) in self.atom_to_entry.iter().copied().enumerate() {
            writeln!(f, "\tatom {i} -> entry {e}")?;
            for r in self.propagate_match([e].into()) {
                writeln!(f, "\t\tregex {r}")?;
            }
        }

        writeln!(f, "#Unique Entries: {}", self.entries.len())?;
        for (i, entry) in self.entries.iter().enumerate() {
            writeln!(
                f,
                "\tEntry: {i} Regexps: {} Threshold: {}",
                entry.regexps.len(),
                entry.propagate_up_at_count,
            )?;
            for parent in &entry.parents {
                writeln!(f, "\t\tParent {parent}")?;
            }
        }
        Ok(())
    }
}

type NodeSet<'a> = std::collections::HashSet<&'a Model>;

/// Each unique node has a corresponding Entry that helps in passing
/// the matching trigger information along the tree.
#[derive(Default, Clone, Debug)]
struct Entry {
    /// How many children should match before this node triggers the
    /// parent. For an atom and an OR node, this is 1 and for an AND
    /// node, it is the number of unique children.
    propagate_up_at_count: usize,

    /// When this node is ready to trigger the parent, what are the indices
    /// of the parent nodes to trigger. The reason there may be more than
    /// one is because of sharing. For example (abc | def) and (xyz | def)
    /// are two different nodes, but they share the atom 'def'. So when
    /// 'def' matches, it triggers two parents, corresponding to the two
    /// different OR nodes.
    parents: Vec<usize>,

    /// When this node is ready to trigger the parent, what are the
    /// regexps that are triggered.
    regexps: Vec<usize>,
}
pub struct Mapper {
    /// Number of regexes covered by the mapper
    regexp_count: usize,
    /// Nodes formed by build, there is one node for each unique atom
    /// and each unique and/or node
    entries: Vec<Entry>,
    /// Indices of regexps which always make it through the filter
    /// (no distinguishing literals were found in them)
    unfiltered: Vec<usize>,
    /// Atom index to entry id mapping
    atom_to_entry: Vec<usize>,
}
impl Mapper {
    // TODO: better name; also see whether this could yield matches on
    // the fly instead of building a Vec
    pub fn atom_to_re(&self, atoms: impl IntoIterator<Item = usize>) -> Vec<usize> {
        let matched_atom_ids = atoms
            .into_iter()
            .map(|idx| self.atom_to_entry[idx])
            .collect();
        let regexps_map = self.propagate_match(matched_atom_ids);

        let mut regexps = Vec::with_capacity(regexps_map.len() + self.unfiltered.len());
        regexps.extend(&self.unfiltered);
        regexps.extend(regexps_map);

        regexps.sort_unstable();
        regexps
    }

    fn propagate_match(&self, mut work: IndexSet<usize>) -> HashSet<usize> {
        work.reserve(self.entries.len() - work.len());
        let mut count = HashMap::with_capacity(self.entries.len());

        let mut regexps = HashSet::with_capacity(self.regexp_count);

        let mut i = 0;
        while i < work.len() {
            let idx = work[i];
            i += 1;

            let entry = &self.entries[idx];
            // record regexps triggered
            regexps.extend(&entry.regexps);
            // pass trigger up to parents
            for &j in &entry.parents {
                let parent = &self.entries[j];
                // Delay until all the children have succeeded.
                if parent.propagate_up_at_count > 1 {
                    let c = count.entry(j).and_modify(|e| *e += 1).or_insert(1);
                    if *c < parent.propagate_up_at_count {
                        continue;
                    }
                }
                work.insert(j);
            }
        }

        regexps
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::model::Model;
    use regex_syntax::parse;

    #[test]
    fn empty_matcher() {
        let (m, atoms) = Builder::new(3).build();
        assert_eq!(atoms.len(), 0);
        assert_eq!(&m.unfiltered, &[]);
    }

    #[test]
    fn empty_pattern() {
        let mut b = Builder::new(3);
        b.push(Model::new(&parse("").unwrap()).unwrap());
        let (m, atoms) = b.build();
        assert_eq!(atoms.len(), 0);
        assert_eq!(&m.unfiltered, &[0]);
    }

    #[test]
    fn small_or_test() {
        let mut b = Builder::new(4);
        b.push(Model::new(&parse("(foo|bar)").unwrap()).unwrap());
        let (m, atoms) = b.build();
        assert_eq!(atoms.len(), 0);
        assert_eq!(&m.unfiltered, &[0]);
        assert_eq!(&m.atom_to_entry, &[])
    }

    #[test]
    fn reverse_index() {
        let mut b = Builder::new(3);
        b.push(Model::new(&parse("(foo|bar)").unwrap()).unwrap());
        let (m, _) = b.build();

        assert_eq!(m.entries.len(), 3);
        assert_eq!(&m.atom_to_entry, &[0, 1]);
        assert_eq!(m.propagate_match([0].into()), [0].into());
        assert_eq!(m.propagate_match([1].into()), [0].into());
    }

    fn check_patterns(patterns: &'static [&'static str], expected: &'static [&'static str]) {
        let mut b = Builder::new(3);
        for pattern in patterns {
            b.push(Model::new(&parse(pattern).unwrap()).unwrap());
        }
        let (_, mut atoms) = b.build();

        atoms.sort();
        let mut sortspected = expected.to_vec();
        sortspected.sort();
        assert_eq!(atoms, sortspected);
    }

    #[test]
    fn empty_patterns_are_allowed() {
        check_patterns(&[""], &[]);
    }

    #[test]
    fn all_atoms_greater_than_minlength_are_found_and_none_smaller() {
        check_patterns(
            &[
                "(abc123|def456|ghi789).*mnop[x-z]+",
                "abc..yyy..zz",
                "mnmnpp[a-z]+PPP",
            ],
            &[
                "abc123", "def456", "ghi789", "mnop", "abc", "yyy", "mnmnpp", "ppp",
            ],
        );
    }
    #[test]
    fn shortest_substrings_are_kept() {
        check_patterns(
            &[
                "(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+",
                "abcd..yyy..yyyzzz",
                "mnmnpp[a-z]+PPP",
            ],
            &[
                "abc", "ghi789", "xyz", "abcd", "yyy", "yyyzzz", "mnmnpp", "ppp",
            ],
        );
    }

    #[test]
    fn character_class_expansion() {
        check_patterns(
            &["m[a-c][d-f]n.*[x-z]+", "[x-y]bcde[ab]"],
            &[
                "madn", "maen", "mafn", "mbdn", "mben", "mbfn", "mcdn", "mcen", "mcfn", "xbcdea",
                "xbcdeb", "ybcdea", "ybcdeb",
            ],
        );
    }
    #[test]
    fn non_ascii_casefolding() {
        check_patterns(
            &[
                // re2 apparently does some sort of strange normalisation
                // pass which regex does not and which does not seem
                // entirely kosher (might be a unicode-aware but
                // per-character upper then lower since it gets the final
                // position sigma "wrong")
                //"(?i)ΔδΠϖπΣςσ",
                "ΛΜΝΟΠ",
                "ψρστυ",
            ],
            &[
                //"δδπππσσσ",
                "λμνοπ",
                "ψρστυ",
            ],
        );
    }

    #[test]
    fn test_empty_string_in_string_set() {
        let mut b = Builder::new(0);
        b.push(Model::new(&parse("-R.+(|ADD=;AA){12}}").unwrap()).unwrap());
        let (_, mut atoms) = b.build();
        atoms.sort();

        assert_eq!(atoms, vec!["", "-r", "add=;aa", "}"]);
    }
}
diff --git a/regex-filtered/src/model.rs b/regex-filtered/src/model.rs
new file mode 100644
index 0000000..1270aff
--- /dev/null
+++ b/regex-filtered/src/model.rs
@@ -0,0 +1,516 @@
use itertools::iproduct;
use regex_syntax::hir::{self, visit, Hir, HirKind, Visitor};
use std::cell::Cell;
use std::fmt::{Display, Formatter, Write};
use std::str::Utf8Error;
use std::{collections::BTreeSet, ops::Deref};

#[derive(Clone, Debug)]
pub enum Model {
    /// Everything matches.
    All(Cell<usize>),
    /// Nothing matches.
    None(Cell<usize>),
    /// The string matches.
    Atom(Cell<usize>, String),
    /// All sub-filters must match.
    And(Cell<usize>, Vec<Model>),
    /// One sub-filter must match.
    Or(Cell<usize>, Vec<Model>),
}
use Model::{All, And, Atom, None, Or};

impl std::hash::Hash for Model {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        state.write_u8(self.op());
        match self {
            All(_) | None(_) => (),
            Atom(_, s) => s.hash(state),
            And(_, ps) | Or(_, ps) => {
                state.write_usize(ps.len());
                for p in ps {
                    state.write_usize(p.unique_id());
                }
            }
        }
    }
}

impl std::cmp::PartialEq for Model {
    fn eq(&self, other: &Self) -> bool {
        match (self, other) {
            (All(_), All(_)) | (None(_), None(_)) => true,
            (Atom(_, a), Atom(_, b)) => a == b,
            (And(_, va), And(_, vb)) | (Or(_, va), Or(_, vb)) => {
                va.len() == vb.len()
                    && std::iter::zip(va, vb).all(|(a, b)| a.unique_id() == b.unique_id())
            }
            _ => false,
        }
    }
}
impl Eq for Model {}

impl From<String> for Model {
    fn from(s: String) -> Self {
        Atom(Cell::new(usize::MAX), s)
    }
}

impl Display for Model {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match &self {
            All(_) => f.write_str(""),
            None(_) => f.write_str("*no-matches*"),
            Atom(_, s) => f.write_str(s),
            And(_, subs) => {
                for (i, s) in subs.iter().enumerate() {
                    if i != 0 {
                        f.write_char(' ')?;
                    }
                    write!(f, "{s}")?;
                }
                Ok(())
            }
            Or(_, subs) => {
                f.write_char('(')?;
                for (i, s) in subs.iter().enumerate() {
                    if i != 0 {
                        f.write_char('|')?;
                    }
                    write!(f, "{s}")?;
                }
                f.write_char(')')
            }
        }
    }
}

/// Processing errors
#[derive(Debug)]
pub enum Error {
    /// Processing missed or exceeded some of the stack
    FinalizationError,
    /// Processing reached the HIR node limit
    EarlyStop,
    /// Literal was not a valid string
    DecodeError(Utf8Error),
    /// Non-decodable character class
    ClassError(hir::ClassBytes),
}
impl Display for Error {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        write!(f, "{self:?}")
    }
}
impl std::error::Error for Error {}
impl From<Utf8Error> for Error {
    fn from(value: Utf8Error) -> Self {
        Error::DecodeError(value)
    }
}

impl Model {
    pub fn new(r: &Hir) -> Result<Self, Error> {
        visit(r, InfoVisitor::default())
    }

    pub fn unique_id(&self) -> usize {
        match self {
            All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.get(),
        }
    }
    pub fn set_unique_id(&self, value: usize) {
        match self {
            All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.set(value),
        }
    }

    pub fn all() -> Self {
        All(Cell::new(usize::MAX))
    }

    pub fn none() -> Self {
        None(Cell::new(usize::MAX))
    }

    fn or_strings(strings: SSet) -> Self {
        Model::Or(
            Cell::new(usize::MAX),
            simplify_string_set(strings).map(From::from).collect(),
        )
    }

    fn op(&self) -> u8 {
        match self {
            All(_) => 0,
            None(_) => 1,
            Atom(_, _) => 2,
            And(_, _) => 3,
            Or(_, _) => 4,
        }
    }

    /// Simplifies And and Or nodes
    fn simplify(self) -> Self {
        match self {
            And(uid, v) if v.is_empty() => All(uid),
            Or(uid, v) if v.is_empty() => None(uid),
            And(_, mut v) | Or(_, mut v) if v.len() == 1 => {
                v.pop().expect("we checked the length").simplify()
            }
            s => s,
        }
    }
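    // Conjunction of two models: ALL acts as the identity element
    // and NONE as the absorbing element, and nested ANDs are
    // flattened so the tree stays shallow.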
    // re2 merges these into separate helper functions, but that only
    // saves on the header and increases the branching complexity of
    // the rest, so why bother?
    fn and(self, mut b: Self) -> Self {
        let mut a = self.simplify();
        b = b.simplify();

        // Canonicalize: a->op <= b->op.
        if a.op() > b.op() {
            std::mem::swap(&mut a, &mut b);
        }

        // ALL and NONE are the smallest opcodes.
        a = match a {
            // ALL and b = b
            All(..) => return b,
            // NONE and b = NONE
            None(uid) => return None(uid),
            a => a,
        };

        match (a, b) {
            // If a and b match op, merge their contents.
            (And(unique_id, mut va), And(_, vb)) => {
                va.extend(vb);
                And(unique_id, va)
            }
            // If a or b matches the operation, merge the other one in
            (And(unique_id, mut v), vv) | (vv, And(unique_id, mut v)) => {
                v.push(vv);
                And(unique_id, v)
            }
            (a, b) => And(Cell::new(usize::MAX), vec![a, b]),
        }
    }

    fn or(self, mut b: Self) -> Self {
        let mut a = self.simplify();
        b = b.simplify();

        // Canonicalize: a->op <= b->op.
        if a.op() > b.op() {
            std::mem::swap(&mut a, &mut b);
        }

        a = match a {
            // NONE or b = b
            None(..) => return b,
            // ALL or b = ALL
            All(uid) => return All(uid),
            a => a,
        };

        match (a, b) {
            // If a and b match op, merge their contents.
            (Or(unique_id, mut va), Or(_, vb)) => {
                va.extend(vb);
                Or(unique_id, va)
            }
            // If a or b matches the operation, merge the other one in
            (Or(unique_id, mut v), vv) | (vv, Or(unique_id, mut v)) => {
                v.push(vv);
                Or(unique_id, v)
            }
            (a, b) => Or(Cell::new(usize::MAX), vec![a, b]),
        }
    }
}

// Necessary for simplify_string_set to work: the simplification
// consists of removing every "superset" of another string of the
// set, that is any string which contains another (non-empty) string
// of the set, because the smaller atom already indicates that the
// pattern is a candidate, so also matching the larger atom is
// useless.
//
// In order to make the implementation simpler and more efficient,
// visit the smaller strings first; that way we only need to check the
// following siblings (larger strings which *might* contain the
// current one).
#[derive(PartialEq, Eq, Debug, Clone)]
struct LengthThenLex(pub String);
impl Deref for LengthThenLex {
    type Target = String;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
impl Ord for LengthThenLex {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.0
            .len()
            .cmp(&other.0.len())
            .then_with(|| self.0.cmp(&other.0))
    }
}
impl PartialOrd for LengthThenLex {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
type SSet = BTreeSet<LengthThenLex>;
fn simplify_string_set(strings: SSet) -> impl Iterator<Item = String> {
    let mut to_keep = vec![true; strings.len()];
    let mut e = strings.iter().enumerate();
    while let Some((i, s)) = e.next() {
        if s.is_empty() || !to_keep[i] {
            continue;
        }

        for (keep, (_, s2)) in to_keep[i..].iter_mut().skip(1).zip(e.clone()) {
            if *keep && s2.len() > s.len() && s2.0.contains(&s.0) {
                *keep = false;
            }
        }
    }

    std::iter::zip(to_keep, strings)
        .filter(|v| v.0)
        .map(|v| v.1 .0)
}

/// Intermediate information about the set of strings a regex matches,
/// used for the computation of a prefilter.
#[derive(Debug)]
enum Info {
    Match(Model),
    Exact(SSet),
}
impl Info {
    fn take_match(self) -> Model {
        match self {
            Self::Match(p) => p,
            Self::Exact(s) => Model::or_strings(s),
        }
    }

    fn into_exact(self) -> Option<SSet> {
        match self {
            Self::Exact(s) => Some(s),
            Self::Match(_) => Option::None,
        }
    }
}

struct InfoVisitor {
    stack: Vec<Info>,
    max_visits: usize,
}
impl Default for InfoVisitor {
    fn default() -> Self {
        Self {
            max_visits: 100_000,
            stack: Vec::new(),
        }
    }
}

// [`regex_syntax::hir::Visitor`] works quite differently from
// `re2::Regexp::Walker`, as it does not return / merge anything, so
// we need to merge down into the stack on post.
impl Visitor for InfoVisitor {
    type Output = Model;
    type Err = Error;

    fn finish(mut self) -> Result<Self::Output, Self::Err> {
        (self.stack.len() == 1)
            .then_some(&mut self.stack)
            .and_then(|s| s.pop())
            .map(Info::take_match)
            .ok_or(Error::FinalizationError)
    }

    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        // re2 sets `stopped_early` and calls `ShortVisit` but keeps
        // on keeping on, it's not clear why; ultimately BuildInfo only
        // cares about having stopped early
        self.max_visits = self.max_visits.checked_sub(1).ok_or(Error::EarlyStop)?;

        Ok(())
    }

    fn visit_post(&mut self, hir: &Hir) -> Result<(), Self::Err> {
        match hir.kind() {
            HirKind::Empty | HirKind::Look(_) => {
                self.stack
                    .push(Info::Exact([LengthThenLex(String::new())].into()));
            }
            HirKind::Literal(hir::Literal(data)) => {
                if data.is_empty() {
                    // NoMatch
                    self.stack.push(Info::Match(Model::none()));
                } else {
                    // re2 does this weirdly, as it performs a cross
                    // product of individual characters, but as far as
                    // I understand that's just a complicated way to
                    // build a singleton set of the payload?
                    self.stack.push(Info::Exact(
                        [LengthThenLex(std::str::from_utf8(data)?.to_lowercase())].into(),
                    ));
                }
            }
            HirKind::Class(cls) => {
                let uc;
                let c = match cls {
                    hir::Class::Unicode(c) => c,
                    hir::Class::Bytes(b) => {
                        uc = b
                            .to_unicode_class()
                            .ok_or_else(|| Error::ClassError(b.clone()))?;
                        &uc
                    }
                };
                self.stack
                    .push(if c.iter().map(|r| r.len()).sum::<usize>() > 10 {
                        Info::Match(Model::all())
                    } else {
                        Info::Exact(
                            c.iter()
                                .flat_map(|r| (r.start()..=r.end()))
                                .map(char::to_lowercase)
                                .map(String::from_iter)
                                .map(LengthThenLex)
                                .collect(),
                        )
                    });
            }
            // Apparently re2 and regex make inverse choices: re2
            // normalises repetitions to */+/?, while regex normalises
            // everything to {a, b}, so this may or may not make sense
            HirKind::Repetition(hir::Repetition { min, .. }) => {
                if *min == 0 {
                    // corresponds to */? (star/quest)
                    self.stack.pop();
                    self.stack.push(Info::Match(Model::all()));
                } else {
                    // corresponds to +
                    let arg = self
                        .stack
                        .pop()
                        .expect("a repetition to be associated with a pattern to repeat")
                        .take_match();
                    self.stack.push(Info::Match(arg));
                }
            }
            // should just leave its child on the stack for whoever
            // lives up the tree
            HirKind::Capture(_) => (),
            HirKind::Alternation(alt) => {
                // needs to pop alt.len() items from the stack, and if
                // they're ``exact`` then just merge them, otherwise
                // ``Prefilter::Or`` them

                // sort the top n entries to have the exacts at the
                // top (end of the stack), largest last
                let topn = self.stack.len() - alt.len()..;
                let infos = &mut self.stack[topn.clone()];

                let matches =
                    topn.start + infos.iter().filter(|v| matches!(v, Info::Match(_))).count();
                // I think we can do that because we don't actually
                // regex-match here, so order should not matter(?)
                infos.sort_unstable_by_key(|v| match v {
                    Info::Match(_) => (false, 0),
                    Info::Exact(s) => (true, s.len()),
                });
                // there are exact matches, merge them
                let exacts = self
                    .stack
                    .drain(matches..)
                    .rev()
                    .fold(BTreeSet::new(), |mut s, i| {
                        s.append(
                            &mut i
                                .into_exact()
                                .expect("the top `matches` records should be exacts"),
                        );
                        s
                    });
                let mut matches = self
                    .stack
                    .drain(topn)
                    .map(Info::take_match)
                    .collect::<Vec<_>>();
                self.stack.push(if matches.is_empty() {
                    Info::Exact(exacts)
                } else {
                    if !exacts.is_empty() {
                        matches.push(Model::or_strings(exacts));
                    }
                    Info::Match(
                        matches
                            .into_iter()
                            .map(From::from)
                            .fold(Model::none(), Model::or),
                    )
                });
            }
            // and this one gets really painful: like above we need to
            // take the top n, but unlike the above we can't reorder
            // our items
            HirKind::Concat(c) => {
                let topn = self.stack.len() - c.len()..;

                // ALL is the identity element of AND
                let mut result = Info::Match(Model::all());
                let mut exacts = BTreeSet::new();
                for info in self.stack.drain(topn) {
                    match info {
                        Info::Exact(set) if exacts.is_empty() => {
                            exacts = set;
                        }
                        Info::Exact(set) if set.len() * exacts.len() <= 16 => {
                            // Not useful to consume the existing
                            // `exacts` up-front, as each item has to
                            // be splatted over `set`.
                            exacts = iproduct!(&exacts, &set)
                                .map(|(s, ss)| {
                                    let mut r = String::with_capacity(s.len() + ss.len());
                                    r.push_str(s);
                                    r.push_str(ss);
                                    LengthThenLex(r)
                                })
                                .collect();
                        }
                        i => {
                            // here AND together the incoming info,
                            // the accumulated exacts, and the
                            // existing result
                            let mut p = result.take_match();
                            if !exacts.is_empty() {
                                p = Model::and(p, Model::or_strings(std::mem::take(&mut exacts)));
                            }
                            p = Model::and(p, i.take_match());
                            result = Info::Match(p);
                        }
                    }
                }

                if exacts.is_empty() {
                    self.stack.push(result);
                } else {
                    self.stack.push(Info::Match(Model::and(
                        result.take_match(),
                        Model::or_strings(exacts),
                    )));
                }
            }
        }
        Ok(())
    }
}
diff --git a/ua-parser/Cargo.toml b/ua-parser/Cargo.toml
new file mode 100644
index 0000000..8d6e06b
--- /dev/null
+++ b/ua-parser/Cargo.toml
@@ -0,0 +1,19 @@
[package]
name = "ua-parser"
version = "0.1.0"
edition = "2021"
license = "Apache-2.0"
description = "Rust implementation of the User Agent String Parser project"
keywords = ["ua_parser", "user-agent", "user-agent-parser"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
regex = "1.10.4"
regex-filtered = { version = "0.1.0", path = "../regex-filtered" }
serde = { version = "1.0.203", features = ["derive"] }

[dev-dependencies]
clap = { version = "4.5.6", features = ["derive"] }
serde_json = "1.0.117"
serde_yaml = "0.9.34"
diff --git a/ua-parser/LICENSE b/ua-parser/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/ua-parser/LICENSE
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/ua-parser/examples/bench.rs b/ua-parser/examples/bench.rs
new file mode 100644
index 0000000..460a271
--- /dev/null
+++ b/ua-parser/examples/bench.rs
@@ -0,0 +1,46 @@
+use clap::Parser;
+use std::io::{BufRead, BufReader};
+use std::path::PathBuf;
+
+#[derive(Parser, Debug)]
+struct Args {
+    /// regexes.yaml file to parse the data file with
+    regexes: PathBuf,
+    /// user agents file
+    user_agents: PathBuf,
+    /// number of repetitions through the user agent file
+    #[arg(short, long, default_value_t = 1)]
+    repetitions: usize,
+}
+
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let Args {
+        regexes,
+        user_agents,
+        repetitions,
+    } = Args::parse();
+
+    let f = std::fs::File::open(regexes)?;
+    let r = ua_parser::Extractor::try_from(serde_yaml::from_reader::<_, ua_parser::Regexes>(f)?)?;
+
+    let uas = BufReader::new(std::fs::File::open(user_agents)?)
+        .lines()
+        .collect::<Result<Vec<_>, _>>()?;
+
+    let start = std::time::Instant::now();
+    for _ in 0..repetitions {
+        for ua in &uas {
+            drop(r.extract(ua));
+        }
+    }
+
+    let elapsed = start.elapsed();
+    println!("Lines: {}", repetitions * uas.len());
+    println!("Total time: {elapsed:?}");
+    println!(
+        "{}µs / line",
+        elapsed.as_micros() / (repetitions * uas.len()) as u128
+    );
+
+    Ok(())
+}
diff --git a/ua-parser/src/lib.rs b/ua-parser/src/lib.rs
new file mode 100644
index 0000000..9f9d0d5
--- /dev/null
+++ b/ua-parser/src/lib.rs
@@ -0,0 +1,609 @@
+#![deny(unsafe_code)]
+#![warn(missing_docs)]
+#![allow(clippy::empty_docs)]
+#![doc = include_str!("../../README.md")]
+
+use regex::Captures;
+use serde::Deserialize;
+
+pub use regex_filtered::{BuildError, ParseError};
+
+mod resolvers;
+
+/// Error returned if the conversion of [`Regexes`] to [`Extractor`]
+/// fails.
+#[derive(Debug)]
+pub enum Error {
+    /// Compilation failed because one of the input regexes could not
+    /// be parsed or processed.
+    ParseError(ParseError),
+    /// Compilation failed because one of the prefilters could not be
+    /// built.
+    BuildError(BuildError),
+    /// A replacement template requires a group missing from the regex.
+    MissingGroup(usize),
+}
+impl std::error::Error for Error {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match self {
+            Error::ParseError(p) => Some(p),
+            Error::BuildError(b) => Some(b),
+            Error::MissingGroup(_) => None,
+        }
+    }
+}
+impl std::fmt::Display for Error {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+impl From<ParseError> for Error {
+    fn from(value: ParseError) -> Self {
+        Self::ParseError(value)
+    }
+}
+impl From<BuildError> for Error {
+    fn from(value: BuildError) -> Self {
+        Self::BuildError(value)
+    }
+}
+
+/// Deserialization target for the parser descriptors, can be used
+/// with the relevant serde implementation to load from `regexes.yaml`
+/// or a conversion thereof.
+///
+/// Can then be compiled to a full [`Extractor`], or an individual
+/// list of parsers can be converted to the corresponding extractor.
+#[allow(missing_docs)]
+#[derive(Deserialize)]
+pub struct Regexes<'a> {
+    pub user_agent_parsers: Vec<user_agent::Parser<'a>>,
+    pub os_parsers: Vec<os::Parser<'a>>,
+    pub device_parsers: Vec<device::Parser<'a>>,
+}
+
+impl<'a> TryFrom<Regexes<'a>> for Extractor<'a> {
+    type Error = Error;
+    /// Compile parsed regexes to the corresponding full extractor.
+    ///
+    /// Prefer the individual builders / extractors if you don't need
+    /// all three domains extracted, as each individual extractor is
+    /// somewhat costly to create.
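+    ///
+    /// A sketch of building a single domain instead, assuming a
+    /// `regexes` value deserialized as in the crate docs:
+    ///
+    /// ```ignore
+    /// let os = ua_parser::os::Builder::new()
+    ///     .push_all(regexes.os_parsers)?
+    ///     .build()?;
+    /// ```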
+    fn try_from(r: Regexes<'a>) -> Result<Self, Self::Error> {
+        let ua = r
+            .user_agent_parsers
+            .into_iter()
+            .try_fold(user_agent::Builder::new(), |b, p| b.push(p))?
+            .build()?;
+        let os = r
+            .os_parsers
+            .into_iter()
+            .try_fold(os::Builder::new(), |b, p| b.push(p))?
+            .build()?;
+        let dev = r
+            .device_parsers
+            .into_iter()
+            .try_fold(device::Builder::new(), |b, p| b.push(p))?
+            .build()?;
+        Ok(Extractor { ua, os, dev })
+    }
+}
+
+/// Full extractor, simply delegates to the underlying individual
+/// extractors for the actual job.
+#[allow(missing_docs)]
+pub struct Extractor<'a> {
+    pub ua: user_agent::Extractor<'a>,
+    pub os: os::Extractor<'a>,
+    pub dev: device::Extractor<'a>,
+}
+impl<'a> Extractor<'a> {
+    /// Performs the extraction on every sub-extractor in sequence.
+    pub fn extract(
+        &'a self,
+        ua: &'a str,
+    ) -> (
+        Option<user_agent::ValueRef<'a>>,
+        Option<os::ValueRef<'a>>,
+        Option<device::ValueRef<'a>>,
+    ) {
+        (
+            self.ua.extract(ua),
+            self.os.extract(ua),
+            self.dev.extract(ua),
+        )
+    }
+}
+
+/// User agent module.
+///
+/// The user agent is the representation of the browser; in UAP lingo
+/// it is composed of a *family* (the browser project) and a *version*
+/// of up to 4 segments.
+pub mod user_agent {
+    use serde::Deserialize;
+    use std::borrow::Cow;
+
+    use crate::resolvers::{FallbackResolver, FamilyResolver};
+    use regex_filtered::BuildError;
+
+    /// Individual user agent parser description. Plain data which can
+    /// be deserialized from serde-compatible storage, or created
+    /// literally (e.g. using a conversion or build script).
+    #[derive(Deserialize, Default)]
+    pub struct Parser<'a> {
+        /// Regex to check the UA against; if the regex matches, the
+        /// parser applies.
+        pub regex: Cow<'a, str>,
+        /// If set, used for the [`ValueRef::family`] field. If it
+        /// contains a `$1` placeholder, that is replaced by the value
+        /// of the first match group.
+        ///
+        /// If unset, the first match group is used directly.
+        pub family_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the major version number,
+        /// otherwise the second match group is used.
+        pub v1_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the minor version number,
+        /// otherwise the third match group is used.
+        pub v2_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the patch version number,
+        /// otherwise the fourth match group is used.
+        pub v3_replacement: Option<Cow<'a, str>>,
+        /// If set, provides the value of the minor patch version
+        /// number, otherwise the fifth match group is used.
+        pub v4_replacement: Option<Cow<'a, str>>,
+    }
+
+    type Repl<'a> = (
+        FamilyResolver<'a>,
+        // Per spec, should actually be restrict-templated (same as
+        // family but for indexes 2-5 instead of 1).
+        FallbackResolver<'a>,
+        FallbackResolver<'a>,
+        FallbackResolver<'a>,
+        FallbackResolver<'a>,
+    );
+
+    /// Extractor builder, used to `push` parsers into before building
+    /// the extractor.
+    #[derive(Default)]
+    pub struct Builder<'a> {
+        builder: regex_filtered::Builder,
+        repl: Vec<Repl<'a>>,
+    }
+    impl<'a> Builder<'a> {
+        /// Initialise an empty builder.
+        pub fn new() -> Self {
+            Self::default()
+        }
+
+        /// Build the extractor; may be called without pushing any
+        /// parser in, though that is not very useful.
+        pub fn build(self) -> Result<Extractor<'a>, BuildError> {
+            let Self { builder, repl } = self;
+
+            Ok(Extractor {
+                matcher: builder.build()?,
+                repl,
+            })
+        }
+
+        /// Pushes a parser into the builder; may fail if the
+        /// [`Parser::regex`] is invalid, or if
+        /// [`Parser::family_replacement`] contains a `$1` placeholder
+        /// but the regex has no group.
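+        ///
+        /// A sketch with a hypothetical pattern (not from the real
+        /// `regexes.yaml`):
+        ///
+        /// ```ignore
+        /// let builder = Builder::new().push(Parser {
+        ///     regex: r"(Firefox)/(\d+)\.(\d+)".into(),
+        ///     ..Default::default()
+        /// })?;
+        /// ```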
+        pub fn push(mut self, ua: Parser<'a>) -> Result<Self, crate::Error> {
+            self.builder = self.builder.push(&ua.regex)?;
+            let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
+            // number of groups in regex, excluding implicit entire match group
+            let groups = r.captures_len() - 1;
+            self.repl.push((
+                FamilyResolver::new(ua.family_replacement, groups)?,
+                FallbackResolver::new(ua.v1_replacement, groups, 2),
+                FallbackResolver::new(ua.v2_replacement, groups, 3),
+                FallbackResolver::new(ua.v3_replacement, groups, 4),
+                FallbackResolver::new(ua.v4_replacement, groups, 5),
+            ));
+            Ok(self)
+        }
+
+        /// Bulk loading of parsers into the builder.
+        pub fn push_all<I>(self, ua: I) -> Result<Self, crate::Error>
+        where
+            I: IntoIterator<Item = Parser<'a>>,
+        {
+            ua.into_iter().try_fold(self, |s, p| s.push(p))
+        }
+    }
+
+    /// User Agent extractor.
+    pub struct Extractor<'a> {
+        matcher: regex_filtered::Regexes,
+        repl: Vec<Repl<'a>>,
+    }
+    impl<'a> Extractor<'a> {
+        /// Tries the loaded [`Parser`]s in order; upon finding the
+        /// first matching one, performs data extraction following its
+        /// replacement directives and returns the result.
+        ///
+        /// Returns [`None`] if:
+        ///
+        /// - no matching parser was found
+        /// - the match does not have any matching groups *and*
+        ///   [`Parser::family_replacement`] is unset
+        /// - [`Parser::family_replacement`] has a substitution
+        ///   but there is no group in the regex
+        pub fn extract(&'a self, ua: &'a str) -> Option<ValueRef<'a>> {
+            let (idx, re) = self.matcher.matching(ua).next()?;
+            let c = re.captures(ua)?;
+
+            let (f, v1, v2, v3, v4) = &self.repl[idx];
+
+            Some(ValueRef {
+                family: f.resolve(&c),
+                major: v1.resolve(&c),
+                minor: v2.resolve(&c),
+                patch: v3.resolve(&c),
+                patch_minor: v4.resolve(&c),
+            })
+        }
+    }
+    /// Borrowed extracted value, borrows the content of the original
+    /// parser or the content of the user agent string, unless a
+    /// replacement is performed (which is only possible for the
+    /// family).
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct ValueRef<'a> {
+        ///
+        pub family: Cow<'a, str>,
+        ///
+        pub major: Option<&'a str>,
+        ///
+        pub minor: Option<&'a str>,
+        ///
+        pub patch: Option<&'a str>,
+        ///
+        pub patch_minor: Option<&'a str>,
+    }
+
+    impl ValueRef<'_> {
+        /// Converts the borrowed result into an owned one,
+        /// independent from both the extractor and the user agent
+        /// string.
+        pub fn into_owned(self) -> Value {
+            Value {
+                family: self.family.into_owned(),
+                major: self.major.map(|c| c.to_string()),
+                minor: self.minor.map(|c| c.to_string()),
+                patch: self.patch.map(|c| c.to_string()),
+                patch_minor: self.patch_minor.map(|c| c.to_string()),
+            }
+        }
+    }
+
+    /// Owned extracted value, identical to [`ValueRef`] but not
+    /// linked to either the UA string or the extractor.
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct Value {
+        ///
+        pub family: String,
+        ///
+        pub major: Option<String>,
+        ///
+        pub minor: Option<String>,
+        ///
+        pub patch: Option<String>,
+        ///
+        pub patch_minor: Option<String>,
+    }
+}
+
+/// OS extraction module
+pub mod os {
+    use serde::Deserialize;
+    use std::borrow::Cow;
+
+    use regex_filtered::{BuildError, ParseError};
+
+    use crate::resolvers::{OptResolver, Resolver};
+
+    /// OS parser configuration
+    #[derive(Deserialize, Default)]
+    pub struct Parser<'a> {
+        ///
+        pub regex: Cow<'a, str>,
+        /// Replacement for the [`ValueRef::os`]; must be set if there
+        /// is no capture in the [`Self::regex`]. If there are
+        /// captures, it may be fully templated (with `$n` placeholders
+        /// for any group of the [`Self::regex`]).
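+        ///
+        /// e.g. a hypothetical `os_replacement` of `"Mac OS X $1.$2"`
+        /// over captures `10` and `15` would resolve to
+        /// `Mac OS X 10.15`.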
+        pub os_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::major`], may be fully templated.
+        pub os_v1_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::minor`], may be fully templated.
+        pub os_v2_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::patch`], may be fully templated.
+        pub os_v3_replacement: Option<Cow<'a, str>>,
+        /// Replacement for the [`ValueRef::patch_minor`], may be fully templated.
+        pub os_v4_replacement: Option<Cow<'a, str>>,
+    }
+    /// Builder for [`Extractor`].
+    #[derive(Default)]
+    pub struct Builder<'a> {
+        builder: regex_filtered::Builder,
+        repl: Vec<(
+            Resolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+        )>,
+    }
+    impl<'a> Builder<'a> {
+        ///
+        pub fn new() -> Self {
+            Self::default()
+        }
+
+        /// Builds the [`Extractor`], may fail if building the
+        /// prefilter fails.
+        pub fn build(self) -> Result<Extractor<'a>, BuildError> {
+            let Self { builder, repl } = self;
+
+            Ok(Extractor {
+                matcher: builder.build()?,
+                repl,
+            })
+        }
+
+        /// Add a [`Parser`] configuration; fails if the regex can not
+        /// be parsed.
+        pub fn push(mut self, os: Parser<'a>) -> Result<Self, ParseError> {
+            self.builder = self.builder.push(&os.regex)?;
+            let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
+            // number of groups in regex, excluding implicit entire match group
+            let groups = r.captures_len() - 1;
+            self.repl.push((
+                Resolver::new(os.os_replacement, groups, 1),
+                OptResolver::new(os.os_v1_replacement, groups, 2),
+                OptResolver::new(os.os_v2_replacement, groups, 3),
+                OptResolver::new(os.os_v3_replacement, groups, 4),
+                OptResolver::new(os.os_v4_replacement, groups, 5),
+            ));
+            Ok(self)
+        }
+
+        /// Bulk loading of parsers into the builder.
+        pub fn push_all<I>(self, ua: I) -> Result<Self, ParseError>
+        where
+            I: IntoIterator<Item = Parser<'a>>,
+        {
+            ua.into_iter().try_fold(self, |s, p| s.push(p))
+        }
+    }
+
+    /// OS extractor structure
+    pub struct Extractor<'a> {
+        matcher: regex_filtered::Regexes,
+        repl: Vec<(
+            Resolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+            OptResolver<'a>,
+        )>,
+    }
+    impl<'a> Extractor<'a> {
+        /// Matches & extracts the OS data for this user agent,
+        /// returns `None` if the UA string could not be matched.
+        pub fn extract(&'a self, ua: &'a str) -> Option<ValueRef<'a>> {
+            let (idx, re) = self.matcher.matching(ua).next()?;
+            let c = re.captures(ua)?;
+
+            let (o, v1, v2, v3, v4) = &self.repl[idx];
+
+            Some(ValueRef {
+                os: o.resolve(&c),
+                major: v1.resolve(&c),
+                minor: v2.resolve(&c),
+                patch: v3.resolve(&c),
+                patch_minor: v4.resolve(&c),
+            })
+        }
+    }
+
+    /// An OS extraction result.
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct ValueRef<'a> {
+        ///
+        pub os: Cow<'a, str>,
+        ///
+        pub major: Option<Cow<'a, str>>,
+        ///
+        pub minor: Option<Cow<'a, str>>,
+        ///
+        pub patch: Option<Cow<'a, str>>,
+        ///
+        pub patch_minor: Option<Cow<'a, str>>,
+    }
+
+    impl ValueRef<'_> {
+        /// Converts a [`ValueRef`] into a [`Value`] to avoid lifetime
+        /// concerns; may need to allocate and copy any data currently
+        /// borrowed from a [`Parser`] or user agent string.
+        pub fn into_owned(self) -> Value {
+            Value {
+                os: self.os.into_owned(),
+                major: self.major.map(|c| c.into_owned()),
+                minor: self.minor.map(|c| c.into_owned()),
+                patch: self.patch.map(|c| c.into_owned()),
+                patch_minor: self.patch_minor.map(|c| c.into_owned()),
+            }
+        }
+    }
+
+    /// Owned version of [`ValueRef`].
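+    ///
+    /// A sketch of obtaining one, assuming `ex` is a built OS
+    /// [`Extractor`] and `ua` a user agent string:
+    ///
+    /// ```ignore
+    /// let owned: Value = ex.extract(ua).unwrap().into_owned();
+    /// ```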
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct Value {
+        ///
+        pub os: String,
+        ///
+        pub major: Option<String>,
+        ///
+        pub minor: Option<String>,
+        ///
+        pub patch: Option<String>,
+        ///
+        pub patch_minor: Option<String>,
+    }
+}
+
+/// Extraction module for the device data of the user agent string.
+pub mod device {
+    use serde::Deserialize;
+    use std::borrow::Cow;
+
+    use regex_filtered::{BuildError, ParseError};
+
+    use crate::resolvers::{OptResolver, Resolver};
+
+    /// regex flags
+    #[derive(Deserialize, PartialEq, Eq)]
+    pub enum Flag {
+        /// Enables case-insensitive regex matching, deserializes from
+        /// the string `"i"`
+        #[serde(rename = "i")]
+        IgnoreCase,
+    }
+    /// Device parser description.
+    #[derive(Deserialize, Default)]
+    pub struct Parser<'a> {
+        /// Regex pattern to use for matching and data extraction.
+        pub regex: Cow<'a, str>,
+        /// Configuration flags for the regex, if any.
+        pub regex_flag: Option<Flag>,
+        /// Device replacement data, fully templated, must be present
+        /// *or* the regex must have at least one group, which will be
+        /// used instead.
+        pub device_replacement: Option<Cow<'a, str>>,
+        /// Brand replacement data, fully templated, optional, if
+        /// missing there is no fallback.
+        pub brand_replacement: Option<Cow<'a, str>>,
+        /// Model replacement data, fully templated, optional, if
+        /// missing will be replaced by the first group if the regex
+        /// has one.
+        pub model_replacement: Option<Cow<'a, str>>,
+    }
+
+    /// Extractor builder.
+    #[derive(Default)]
+    pub struct Builder<'a> {
+        builder: regex_filtered::Builder,
+        repl: Vec<(Resolver<'a>, OptResolver<'a>, OptResolver<'a>)>,
+    }
+    impl<'a> Builder<'a> {
+        /// Creates a builder in the default configuration, which is
+        /// the only configuration.
+        pub fn new() -> Self {
+            Self::default()
+        }
+
+        /// Builds an Extractor, may fail if compiling the prefilter fails.
+        pub fn build(self) -> Result<Extractor<'a>, BuildError> {
+            let Self { builder, repl } = self;
+
+            Ok(Extractor {
+                matcher: builder.build()?,
+                repl,
+            })
+        }
+
+        /// Add a parser to the set; may fail if parsing the regex
+        /// fails.
+        pub fn push(mut self, device: Parser<'a>) -> Result<Self, ParseError> {
+            self.builder = self.builder.push_opt(
+                &device.regex,
+                regex_filtered::Options::new()
+                    .case_insensitive(device.regex_flag == Some(Flag::IgnoreCase)),
+            )?;
+            let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
+            // number of groups in regex, excluding implicit entire match group
+            let groups = r.captures_len() - 1;
+            self.repl.push((
+                Resolver::new(device.device_replacement, groups, 1),
+                // brand has no group fallback: pass an out-of-range
+                // group index so the resolver falls through to `None`
+                OptResolver::new(device.brand_replacement, 0, 999),
+                OptResolver::new(device.model_replacement, groups, 1),
+            ));
+            Ok(self)
+        }
+
+        /// Bulk loading of parsers into the builder.
+        pub fn push_all<I>(self, ua: I) -> Result<Self, ParseError>
+        where
+            I: IntoIterator<Item = Parser<'a>>,
+        {
+            ua.into_iter().try_fold(self, |s, p| s.push(p))
+        }
+    }
+
+    /// Device extractor object.
+    pub struct Extractor<'a> {
+        matcher: regex_filtered::Regexes,
+        repl: Vec<(Resolver<'a>, OptResolver<'a>, OptResolver<'a>)>,
+    }
+    impl<'a> Extractor<'a> {
+        /// Perform data extraction from the user agent string;
+        /// returns `None` if no regex in the [`Extractor`] matches
+        /// the input.
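+        ///
+        /// A sketch, assuming `ex` is a built device [`Extractor`]:
+        ///
+        /// ```ignore
+        /// if let Some(d) = ex.extract(ua) {
+        ///     println!("{} ({:?})", d.device, d.brand);
+        /// }
+        /// ```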
+        pub fn extract(&'a self, ua: &'a str) -> Option<ValueRef<'a>> {
+            let (idx, re) = self.matcher.matching(ua).next()?;
+            let c = re.captures(ua)?;
+
+            let (d, v1, v2) = &self.repl[idx];
+
+            Some(ValueRef {
+                device: d.resolve(&c),
+                brand: v1.resolve(&c),
+                model: v2.resolve(&c),
+            })
+        }
+    }
+
+    /// Extracted device content, may borrow from one of the
+    /// [`Parser`]s or from the user agent string.
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct ValueRef<'a> {
+        ///
+        pub device: Cow<'a, str>,
+        ///
+        pub brand: Option<Cow<'a, str>>,
+        ///
+        pub model: Option<Cow<'a, str>>,
+    }
+
+    impl ValueRef<'_> {
+        /// Converts [`Self`] to an owned [`Value`] getting rid of
+        /// borrowing concerns; may need to allocate and copy if any
+        /// of the attributes actually borrows from a [`Parser`] or
+        /// the user agent string.
+        pub fn into_owned(self) -> Value {
+            Value {
+                device: self.device.into_owned(),
+                brand: self.brand.map(|c| c.into_owned()),
+                model: self.model.map(|c| c.into_owned()),
+            }
+        }
+    }
+
+    /// Owned version of [`ValueRef`].
+    #[derive(PartialEq, Eq, Default, Debug)]
+    pub struct Value {
+        ///
+        pub device: String,
+        ///
+        pub brand: Option<String>,
+        ///
+        pub model: Option<String>,
+    }
+}
diff --git a/ua-parser/src/resolvers.rs b/ua-parser/src/resolvers.rs
new file mode 100644
index 0000000..9f52161
--- /dev/null
+++ b/ua-parser/src/resolvers.rs
@@ -0,0 +1,171 @@
+// TODO: what happens in case of optional groups?
+//
+// Sadly regex offers no way to actually query that nicely: via
+// static_captures_len it only specifies whether all groups are
+// required, if any group is optional that returns `None`.
+
+use crate::Error;
+use regex::Captures;
+use std::borrow::Cow;
+
+fn get<'s>(c: &Captures<'s>, group: usize) -> Option<&'s str> {
+    c.get(group).map(|g| g.as_str()).filter(|s| !s.is_empty())
+}
+
+// TODO:
+// - memchr?
+// - u16 checks against u16 buffer (check all positions)?
+// - swar/simd?
+fn has_substitution(s: &str) -> bool {
+    debug_assert!(!s.is_empty());
+    std::iter::zip(s.as_bytes(), &s.as_bytes()[1..]).any(|(&d, n)| d == b'$' && n.is_ascii_digit())
+}
+
+/// Resolver with full templating: the template string can contain
+/// `$n` markers which get replaced by the corresponding match group.
+///
+/// - if there is a non-null replacement pattern, then it must be used with
+///   match groups as template parameters (at indices 1+)
+///   - the result is stripped
+///   - if it is an empty string, then it's replaced by a null
+/// - otherwise fallback to a (possibly optional) match group
+/// - or null (device brand has no fallback)
+pub(crate) enum Resolver<'a> {
+    Replacement(Cow<'a, str>),
+    Capture(usize),
+    Template(Cow<'a, str>),
+}
+impl<'a> Resolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize, idx: usize) -> Self {
+        if let Some(s) = repl.filter(|s| !s.trim().is_empty()) {
+            if has_substitution(&s) {
+                Self::Template(s)
+            } else {
+                Self::Replacement(s)
+            }
+        } else if groups >= idx {
+            Self::Capture(idx)
+        } else {
+            Self::Replacement("".into())
+        }
+    }
+
+    pub(crate) fn resolve(&'a self, c: &Captures<'a>) -> Cow<'a, str> {
+        match self {
+            Self::Replacement(s) => (**s).into(),
+            Self::Capture(i) => get(c, *i).unwrap_or("").into(),
+            Self::Template(t) => {
+                let mut r = String::new();
+                c.expand(t, &mut r);
+                let trimmed = r.trim();
+                if r.len() == trimmed.len() {
+                    r.into()
+                } else {
+                    trimmed.to_string().into()
+                }
+            }
+        }
+    }
+}
+
+/// Similar to [`Resolver`] but allows a [`None`] aka no resolution.
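+/// e.g. an unset `brand_replacement` with no group fallback resolves
+/// to `None` rather than an empty string.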
+pub(crate) enum OptResolver<'a> {
+    None,
+    Replacement(Cow<'a, str>),
+    Capture(usize),
+    Template(Cow<'a, str>),
+}
+impl<'a> OptResolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize, idx: usize) -> Self {
+        if let Some(s) = repl.filter(|s| !s.trim().is_empty()) {
+            if has_substitution(&s) {
+                Self::Template(s)
+            } else {
+                Self::Replacement(s)
+            }
+        } else if groups >= idx {
+            Self::Capture(idx)
+        } else {
+            Self::None
+        }
+    }
+
+    pub(crate) fn resolve(&'a self, c: &Captures<'a>) -> Option<Cow<'a, str>> {
+        match self {
+            Self::None => None,
+            Self::Replacement(s) => Some((**s).into()),
+            Self::Capture(i) => get(c, *i).map(From::from),
+            Self::Template(t) => {
+                let mut r = String::new();
+                c.expand(t, &mut r);
+                let trimmed = r.trim();
+                if trimmed.is_empty() {
+                    None
+                } else if r.len() == trimmed.len() {
+                    Some(r.into())
+                } else {
+                    Some(trimmed.to_string().into())
+                }
+            }
+        }
+    }
+}
+
+/// Dedicated restrict-templated resolver for UserAgent#family:
+/// supports templating in the replacement, but only for the `$1`
+/// value / group.
+pub(crate) enum FamilyResolver<'a> {
+    Capture,
+    Replacement(Cow<'a, str>),
+    Template(Cow<'a, str>),
+}
+impl<'a> FamilyResolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize) -> Result<Self, Error> {
+        match repl {
+            Some(s) if s.contains("$1") => {
+                if groups < 1 {
+                    Err(Error::MissingGroup(1))
+                } else {
+                    Ok(FamilyResolver::Template(s))
+                }
+            }
+            Some(s) if !s.is_empty() => Ok(FamilyResolver::Replacement(s)),
+            _ if groups >= 1 => Ok(FamilyResolver::Capture),
+            _ => Ok(FamilyResolver::Replacement("".into())),
+        }
+    }
+
+    pub(crate) fn resolve(&'a self, c: &super::Captures<'a>) -> Cow<'a, str> {
+        match self {
+            FamilyResolver::Capture => get(c, 1).unwrap_or("").into(),
+            FamilyResolver::Replacement(s) => (**s).into(),
+            FamilyResolver::Template(t) => t.replace("$1", get(c, 1).unwrap_or("")).into(),
+        }
+    }
+}
+
+/// Untemplated resolver, the replacement value is used as-is if
+/// present.
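+/// e.g. a `v1_replacement` of `"10"` always resolves to `Some("10")`,
+/// while an unset one falls back to the match group at `idx` if the
+/// regex has one, and to `None` otherwise.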
+pub(crate) enum FallbackResolver<'a> {
+    None,
+    Capture(usize),
+    Replacement(Cow<'a, str>),
+}
+impl<'a> FallbackResolver<'a> {
+    pub(crate) fn new(repl: Option<Cow<'a, str>>, groups: usize, idx: usize) -> Self {
+        if let Some(s) = repl.filter(|s| !s.is_empty()) {
+            Self::Replacement(s)
+        } else if groups >= idx {
+            Self::Capture(idx)
+        } else {
+            Self::None
+        }
+    }
+    pub(crate) fn resolve(&'a self, c: &super::Captures<'a>) -> Option<&'a str> {
+        match self {
+            FallbackResolver::None => None,
+            FallbackResolver::Capture(n) => get(c, *n),
+            FallbackResolver::Replacement(r) => Some(r),
+        }
+    }
+}
diff --git a/ua-parser/tests/integration.rs b/ua-parser/tests/integration.rs
new file mode 100644
index 0000000..66429f2
--- /dev/null
+++ b/ua-parser/tests/integration.rs
@@ -0,0 +1,391 @@
+use serde::Deserialize;
+
+fn empty_is_none<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: serde_yaml::Value = serde::de::Deserialize::deserialize(deserializer)?;
+    match s {
+        serde_yaml::Value::Null => Ok(None),
+        serde_yaml::Value::String(s) => {
+            if s.is_empty() {
+                Ok(None)
+            } else {
+                Ok(Some(s))
+            }
+        }
+        v => panic!("unexpected value {v:?}"),
+    }
+}
+
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+struct UserAgent {
+    family: String,
+    #[serde(deserialize_with = "empty_is_none")]
+    major: Option<String>,
+    #[serde(deserialize_with = "empty_is_none")]
+    minor: Option<String>,
+    #[serde(deserialize_with = "empty_is_none")]
+    patch: Option<String>,
+    #[serde(default, deserialize_with = "empty_is_none")]
+    patch_minor: Option<String>,
+}
+impl From<ua_parser::user_agent::ValueRef<'_>> for UserAgent {
+    fn from(value: ua_parser::user_agent::ValueRef<'_>) -> Self {
+        let value = value.into_owned();
+        Self {
+            family: value.family,
+            major: value.major,
+            minor: value.minor,
+            patch: value.patch,
+            patch_minor: value.patch_minor,
+        }
+    }
+}
+
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+pub struct OS {
+    pub family: String,
+    pub major: Option<String>,
+    pub minor: Option<String>,
+    pub patch: Option<String>,
+    pub patch_minor: Option<String>,
+}
+impl From<ua_parser::os::ValueRef<'_>> for OS {
+    fn from(value: ua_parser::os::ValueRef<'_>) -> Self {
+        let value = value.into_owned();
+        Self {
+            family: value.os,
+            major: value.major,
+            minor: value.minor,
+            patch: value.patch,
+            patch_minor: value.patch_minor,
+        }
+    }
+}
+
+#[derive(Deserialize, PartialEq, Eq, Debug)]
+pub struct Device {
+    pub family: String,
+    pub brand: Option<String>,
+    pub model: Option<String>,
+}
+impl From<ua_parser::device::ValueRef<'_>> for Device {
+    fn from(value: ua_parser::device::ValueRef<'_>) -> Self {
+        let value = value.into_owned();
+        Self {
+            family: value.device,
+            brand: value.brand,
+            model: value.model,
+        }
+    }
+}
+
+fn get_extractor() -> Result<
+    &'static ua_parser::Extractor<'static>,
+    &'static (dyn std::error::Error + Send + Sync + 'static),
+> {
+    static EXTRACTOR: std::sync::OnceLock<
+        Result<ua_parser::Extractor<'static>, Box<dyn std::error::Error + Send + Sync>>,
+    > = std::sync::OnceLock::new();
+
+    EXTRACTOR
+        .get_or_init(|| {
+            let p: std::path::PathBuf = [env!("CARGO_MANIFEST_DIR"), "uap-core", "regexes.yaml"]
+                .iter()
+                .collect();
+            let rs = serde_yaml::from_reader::<_, ua_parser::Regexes>(std::fs::File::open(p)?)?
+                .try_into()?;
+            Ok(rs)
+        })
+        .as_ref()
+        .map_err(|e| &**e)
+}
+
+#[derive(Deserialize)]
+struct UaTestCases {
+    test_cases: Vec<UaTestCase>,
+}
+#[derive(Deserialize)]
+struct UaTestCase {
+    user_agent_string: String,
+    #[serde(flatten)]
+    ua: UserAgent,
+}
+
+#[test]
+fn test_ua() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "tests",
+        "test_ua.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_ff() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "firefox_user_agent_strings.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_pgts() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "pgts_browser_list.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_opera() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "opera_mini_user_agent_strings.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_podcasting() {
+    let rs = &get_extractor().unwrap().ua;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "podcasting_user_agent_strings.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, UaTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for UaTestCase {
+        user_agent_string,
+        ua,
+    } in items.test_cases
+    {
+        let ua_ = rs.extract(&user_agent_string).map_or_else(
+            || UserAgent {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(ua, ua_, "{user_agent_string}");
+    }
+}
+
+#[derive(Deserialize)]
+struct DevTestCases {
+    test_cases: Vec<DevTestCase>,
+}
+#[derive(Deserialize)]
+struct DevTestCase {
+    user_agent_string: String,
+    #[serde(flatten)]
+    dev: Device,
+}
+
+#[test]
+fn test_device() {
+    let rs = &get_extractor().unwrap().dev;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "tests",
+        "test_device.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items =
+        serde_yaml::from_reader::<_, DevTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for DevTestCase {
+        user_agent_string,
+        dev,
+    } in items.test_cases
+    {
+        let dev_ = rs.extract(&user_agent_string).map_or_else(
+            || Device {
+                family: "Other".to_string(),
+                brand: None,
+                model: None,
+            },
+            From::from,
+        );
+        assert_eq!(dev, dev_, "{user_agent_string}");
+    }
+}
+
+#[derive(Deserialize)]
+struct OSTestCases {
+    test_cases: Vec<OSTestCase>,
+}
+#[derive(Deserialize)]
+struct OSTestCase {
+    user_agent_string: String,
+    #[serde(flatten)]
+    os: OS,
+}
+
+#[test]
+fn test_os() {
+    let rs = &get_extractor().unwrap().os;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "tests",
+        "test_os.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, OSTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for OSTestCase {
+        user_agent_string,
+        os,
+    } in items.test_cases
+    {
+        let os_ = rs.extract(&user_agent_string).map_or_else(
+            || OS {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(os, os_, "{user_agent_string}");
+    }
+}
+
+#[test]
+fn test_additional_os() {
+    let rs = &get_extractor().unwrap().os;
+
+    let p = [
+        env!("CARGO_MANIFEST_DIR"),
+        "uap-core",
+        "test_resources",
+        "additional_os_tests.yaml",
+    ]
+    .iter()
+    .collect::<std::path::PathBuf>();
+    let items = serde_yaml::from_reader::<_, OSTestCases>(std::fs::File::open(p).unwrap()).unwrap();
+    for OSTestCase {
+        user_agent_string,
+        os,
+    } in items.test_cases
+    {
+        let os_ = rs.extract(&user_agent_string).map_or_else(
+            || OS {
+                family: "Other".to_string(),
+                major: None,
+                minor: None,
+                patch: None,
+                patch_minor: None,
+            },
+            From::from,
+        );
+        assert_eq!(os, os_, "{user_agent_string}");
+    }
+}
diff --git a/ua-parser/uap-core b/ua-parser/uap-core
new file mode 160000
index 0000000..ae4c16d
--- /dev/null
+++ b/ua-parser/uap-core
@@ -0,0 +1 @@
+Subproject commit ae4c16ddd81a01c66f396953016c9e06f695d78f