diff --git a/.github/DOCS.md b/.github/DOCS.md
new file mode 100644
index 0000000..962c7ae
--- /dev/null
+++ b/.github/DOCS.md
@@ -0,0 +1,33 @@
+# Github config and workflows
+
+Copied from <https://github.com/kod-kristoff/rust-ci-conf>.
+
+In this folder there is configuration for codecoverage, dependabot, and ci
+workflows that check the library more deeply than the default configurations.
+
+This folder can be or was merged using a --allow-unrelated-histories merge
+strategy from <https://github.com/kod-kristoff/rust-ci-conf>, which provides a
+reasonably sensible base for writing your own ci on. By using this strategy
+the history of the CI repo is included in your repo, and future updates to
+the CI can be merged later.
+
+To perform this merge run:
+
+```shell
+git remote add ci https://github.com/kod-kristoff/rust-ci-conf.git
+git fetch ci
+git merge --allow-unrelated-histories ci/main
+```
+
+or
+
+```shell
+git remote add ci git@github.com:kod-kristoff/rust-ci-conf.git
+git fetch ci
+git merge --allow-unrelated-histories ci/main
+```
+
+An overview of the files in this project is available at:
+<https://www.youtube.com/watch?v=xUH-4y92jPg&t=491s>, which contains some
+rationale for decisions and runs through an example of solving minimal version
+and OpenSSL issues.
diff --git a/.github/codecov.yml b/.github/codecov.yml
new file mode 100644
index 0000000..cd5ce8f
--- /dev/null
+++ b/.github/codecov.yml
@@ -0,0 +1,21 @@
+# ref: https://docs.codecov.com/docs/codecovyml-reference
+coverage:
+ # Hold ourselves to a high bar
+ range: 85..100
+ round: down
+ precision: 1
+ status:
+ # ref: https://docs.codecov.com/docs/commit-status
+ project:
+ default:
+ # Avoid false negatives
+ threshold: 1%
+
+# Test files aren't important for coverage
+ignore:
+ - "tests"
+
+# Make comments less noisy
+comment:
+ layout: "files"
+ require_changes: true
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..d0f091e
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,19 @@
+version: 2
+updates:
+ - package-ecosystem: github-actions
+ directory: /
+ schedule:
+ interval: daily
+ - package-ecosystem: cargo
+ directory: /
+ schedule:
+ interval: daily
+ ignore:
+ - dependency-name: "*"
+ # patch and minor updates don't matter for libraries as consumers of this library build
+ # with their own lockfile, rather than the version specified in this library's lockfile
+ # remove this ignore rule if your package has binaries to ensure that the binaries are
+ # built with the exact set of dependencies and those are up to date.
+ update-types:
+ - "version-update:semver-patch"
+ - "version-update:semver-minor"
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
new file mode 100644
index 0000000..f3b6061
--- /dev/null
+++ b/.github/workflows/check.yml
@@ -0,0 +1,130 @@
+# This workflow runs whenever a PR is opened or updated, or a commit is pushed to main. It runs
+# several checks:
+# - fmt: checks that the code is formatted according to rustfmt
+# - clippy: checks that the code does not contain any clippy warnings
+# - doc: checks that the code can be documented without errors
+# - hack: check combinations of feature flags
+# - msrv: check that the msrv specified in the crate is correct
+permissions:
+ contents: read
+# This configuration allows maintainers of this repo to create a branch and pull request based on
+# the new branch. Restricting the push trigger to the main branch ensures that the PR only gets
+# built once.
+on:
+ push:
+ branches: [main]
+ pull_request:
+ merge_group:
+# If new code is pushed to a PR branch, then cancel in progress workflows for that PR. Ensures that
+# we don't waste CI time, and returns results quicker https://github.com/jonhoo/rust-ci-conf/pull/5
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+name: check
+jobs:
+ fmt:
+ runs-on: ubuntu-latest
+ name: stable / fmt
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install stable
+ uses: dtolnay/rust-toolchain@stable
+ with:
+ components: rustfmt
+ - name: cargo fmt --check
+ run: cargo fmt --check
+ clippy:
+ runs-on: ubuntu-latest
+ name: ${{ matrix.toolchain }} / clippy
+ permissions:
+ contents: read
+ checks: write
+ strategy:
+ fail-fast: false
+ matrix:
+ # Get early warning of new lints which are regularly introduced in beta channels.
+ toolchain: [stable, beta]
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install ${{ matrix.toolchain }}
+ uses: dtolnay/rust-toolchain@master
+ with:
+ toolchain: ${{ matrix.toolchain }}
+ components: clippy
+ - name: cargo clippy
+ uses: auguwu/clippy-action@1.4.0
+ with:
+ token: ${{secrets.GITHUB_TOKEN}}
+ doc:
+ # run docs generation on nightly rather than stable. This enables features like
+ # https://doc.rust-lang.org/beta/unstable-book/language-features/doc-cfg.html which allows an
+ # API to be documented as only available on some specific platforms.
+ runs-on: ubuntu-latest
+ name: nightly / doc
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install nightly
+ uses: dtolnay/rust-toolchain@nightly
+ - name: cargo doc
+ run: cargo doc --no-deps --all-features
+ env:
+ RUSTDOCFLAGS: --cfg docsrs
+ hack:
+ # cargo-hack checks combinations of feature flags to ensure that features are all additive
+ # which is required for feature unification
+ runs-on: ubuntu-latest
+ name: ubuntu / stable / features
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install stable
+ uses: dtolnay/rust-toolchain@stable
+ - name: cargo install cargo-hack
+ uses: taiki-e/install-action@cargo-hack
+ # intentionally no target specifier; see https://github.com/jonhoo/rust-ci-conf/pull/4
+ # --feature-powerset runs for every combination of features
+ - name: cargo hack
+ run: cargo hack --feature-powerset check
+ msrv:
+ # check that we can build using the minimal rust version that is specified by this crate
+ runs-on: ubuntu-latest
+ # we use a matrix here just because env can't be used in job names
+ # https://docs.github.com/en/actions/learn-github-actions/contexts#context-availability
+ strategy:
+ matrix:
+ msrv: ["1.74.0"] # 2021 edition requires 1.56
+ name: ubuntu / ${{ matrix.msrv }}
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install ${{ matrix.msrv }}
+ uses: dtolnay/rust-toolchain@master
+ with:
+ toolchain: ${{ matrix.msrv }}
+ - name: cargo +${{ matrix.msrv }} check
+ run: cargo check
+
+ # https://github.com/marketplace/actions/alls-green#why used for branch protection checks
+ check-check:
+ if: always()
+ needs:
+ - fmt
+ - clippy
+ - doc
+ - hack
+ - msrv
+ runs-on: ubuntu-latest
+ permissions: {}
+ steps:
+ - name: Decide whether the needed jobs succeeded or failed
+ uses: re-actors/alls-green@release/v1
+ with:
+ jobs: ${{ toJSON(needs) }}
diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml
new file mode 100644
index 0000000..82ec91b
--- /dev/null
+++ b/.github/workflows/scheduled.yml
@@ -0,0 +1,72 @@
+# Run scheduled (rolling) jobs on a nightly basis, as your crate may break independently of any
+# given PR. E.g., updates to rust nightly and updates to this crate's dependencies. See check.yml for
+# information about how the concurrency cancellation and workflow triggering works
+permissions:
+ contents: read
+on:
+ push:
+ branches: [main]
+ pull_request:
+ schedule:
+ - cron: "7 7 * * *"
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+name: rolling
+jobs:
+ # https://twitter.com/mycoliza/status/1571295690063753218
+ nightly:
+ runs-on: ubuntu-latest
+ name: ubuntu / nightly
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install nightly
+ uses: dtolnay/rust-toolchain@nightly
+ - name: cargo generate-lockfile
+ if: hashFiles('Cargo.lock') == ''
+ run: cargo generate-lockfile
+ - name: cargo test --locked
+ run: cargo test --locked --all-features --all-targets
+ # https://twitter.com/alcuadrado/status/1571291687837732873
+ update:
+ # This action checks that updating the dependencies of this crate to the latest available that
+ # satisfy the versions in Cargo.toml does not break this crate. This is important as consumers
+ # of this crate will generally use the latest available crates. This is subject to the standard
+ # Cargo semver rules (i.e cargo does not update to a new major version unless explicitly told
+ # to).
+ runs-on: ubuntu-latest
+ name: ubuntu / beta / updated
+ # There's no point running this if no Cargo.lock was checked in in the first place, since we'd
+ # just redo what happened in the regular test job. Unfortunately, hashFiles only works in if on
+ # steps, so we repeat it.
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install beta
+ if: hashFiles('Cargo.lock') != ''
+ uses: dtolnay/rust-toolchain@beta
+ - name: cargo update
+ if: hashFiles('Cargo.lock') != ''
+ run: cargo update
+ - name: cargo test
+ if: hashFiles('Cargo.lock') != ''
+ run: cargo test --locked --all-features --all-targets
+ env:
+ RUSTFLAGS: -D deprecated
+
+ # https://github.com/marketplace/actions/alls-green#why used for branch protection checks
+ scheduled-check:
+ if: always()
+ needs:
+ - nightly
+ - update
+ runs-on: ubuntu-latest
+ permissions: {}
+ steps:
+ - name: Decide whether the needed jobs succeeded or failed
+ uses: re-actors/alls-green@release/v1
+ with:
+ jobs: ${{ toJSON(needs) }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..abb0b91
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,172 @@
+# This is the main CI workflow that runs the test suite on all pushes to main and all pull requests.
+# It runs the following jobs:
+# - required: runs the test suite on ubuntu with stable and beta rust toolchains
+# - minimal: runs the test suite with the minimal versions of the dependencies that satisfy the
+# requirements of this crate, and its dependencies
+# - os-check: runs the test suite on mac and windows
+# - coverage: runs the test suite and collects coverage information
+# See check.yml for information about how the concurrency cancellation and workflow triggering works
+permissions:
+ contents: read
+on:
+ push:
+ branches: [main]
+ pull_request:
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+name: test
+jobs:
+ required:
+ runs-on: ubuntu-latest
+ name: ubuntu / ${{ matrix.toolchain }}
+ strategy:
+ matrix:
+ # run on stable and beta to ensure that tests won't break on the next version of the rust
+ # toolchain
+ toolchain: [stable, beta]
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install ${{ matrix.toolchain }}
+ uses: dtolnay/rust-toolchain@master
+ with:
+ toolchain: ${{ matrix.toolchain }}
+ - name: cargo generate-lockfile
+ # enable this ci template to run regardless of whether the lockfile is checked in or not
+ if: hashFiles('Cargo.lock') == ''
+ run: cargo generate-lockfile
+ # https://twitter.com/jonhoo/status/1571290371124260865
+ - name: cargo test --locked
+ run: cargo test --locked --all-features --all-targets
+ # https://github.com/rust-lang/cargo/issues/6669
+ - name: cargo test --doc
+ run: cargo test --locked --all-features --doc
+ minimal:
+   # This job resolves the dependencies of this crate to the oldest versions permitted by
+   # Cargo.toml and runs the test suite against them, to ensure that the version requirements
+   # declared here are actually sufficient. This will pick up issues where this crate relies on
+   # functionality that was introduced later than the version specified (e.g., when we choose
+   # just a major version, but a method was added after this version).
+   #
+   # Resolving *every* dependency to its minimum (-Zminimal-versions) is difficult to get to
+   # succeed, as transitive dependencies are often incorrectly specified (e.g., a dependency
+   # specifies 1.0 but really requires 1.1.5). This job therefore uses -Zdirect-minimal-versions,
+   # which uses the minimal versions for direct dependencies of this crate, while selecting the
+   # maximal versions for the transitive dependencies. If a minimum still needs raising, you can
+   # add a line in your Cargo.toml to artificially increase it, which you do with e.g.:
+   # ```toml
+   # # for minimal-versions
+   # [target.'cfg(any())'.dependencies]
+   # openssl = { version = "0.10.55", optional = true } # needed to allow foo to build with -Zminimal-versions
+   # ```
+   # The optional = true is necessary in case that dependency isn't otherwise transitively required
+   # by your library, and the target bit is so that this dependency edge never actually affects
+   # Cargo build order. See also
+   # https://github.com/jonhoo/fantoccini/blob/fde336472b712bc7ebf5b4e772023a7ba71b2262/Cargo.toml#L47-L49.
+   # This action is run on ubuntu with the stable toolchain, as it is not expected to fail
+   runs-on: ubuntu-latest
+   name: ubuntu / stable / minimal-versions
+   steps:
+     - uses: actions/checkout@v4
+       with:
+         submodules: true
+     - name: Install stable
+       uses: dtolnay/rust-toolchain@stable
+     - name: Install nightly for -Zdirect-minimal-versions
+       uses: dtolnay/rust-toolchain@nightly
+     - name: rustup default stable
+       run: rustup default stable
+     - name: cargo update -Zdirect-minimal-versions
+       run: cargo +nightly update -Zdirect-minimal-versions
+     - name: cargo test
+       run: cargo test --locked --all-features --all-targets
+ os-check:
+ # run cargo test on mac and windows
+ runs-on: ${{ matrix.os }}
+ name: ${{ matrix.os }} / stable
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [macos-latest, windows-latest]
+ steps:
+ # if your project needs OpenSSL, uncomment this to fix Windows builds.
+ # it's commented out by default as the install command takes 5-10m.
+ # - run: echo "VCPKG_ROOT=$env:VCPKG_INSTALLATION_ROOT" | Out-File -FilePath $env:GITHUB_ENV -Append
+ # if: runner.os == 'Windows'
+ # - run: vcpkg install openssl:x64-windows-static-md
+ # if: runner.os == 'Windows'
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install stable
+ uses: dtolnay/rust-toolchain@stable
+ - name: cargo generate-lockfile
+ if: hashFiles('Cargo.lock') == ''
+ run: cargo generate-lockfile
+ - name: cargo test
+ run: cargo test --locked --all-features --all-targets
+ coverage:
+ # use llvm-cov to build and collect coverage and outputs in a format that
+ # is compatible with codecov.io
+ #
+ # note that codecov as of v4 requires that CODECOV_TOKEN from
+ #
+ # https://app.codecov.io/gh/<user>/<project>/settings
+ #
+ # is set in two places on your repo:
+ #
+ # - https://github.com/jonhoo/guardian/settings/secrets/actions
+ # - https://github.com/jonhoo/guardian/settings/secrets/dependabot
+ #
+ # (the latter is needed for codecov uploads to work with Dependabot PRs)
+ #
+ # PRs coming from forks of your repo will not have access to the token, but
+ # for those, codecov allows uploading coverage reports without a token.
+ # it's all a little weird and inconvenient. see
+ #
+ # https://github.com/codecov/feedback/issues/112
+ #
+ # for lots of more discussion
+ runs-on: ubuntu-latest
+ name: ubuntu / stable / coverage
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: true
+ - name: Install stable
+ uses: dtolnay/rust-toolchain@stable
+ with:
+ components: llvm-tools-preview
+ - name: cargo install cargo-llvm-cov
+ uses: taiki-e/install-action@cargo-llvm-cov
+ - name: cargo generate-lockfile
+ if: hashFiles('Cargo.lock') == ''
+ run: cargo generate-lockfile
+ - name: cargo llvm-cov
+ run: cargo llvm-cov --locked --all-features --lcov --output-path lcov.info
+ - name: Record Rust version
+ run: echo "RUST=$(rustc --version)" >> "$GITHUB_ENV"
+ - name: Upload to codecov.io
+ uses: codecov/codecov-action@v4
+ with:
+ fail_ci_if_error: true
+ token: ${{ secrets.CODECOV_TOKEN }}
+ env_vars: OS,RUST
+
+ # https://github.com/marketplace/actions/alls-green#why used for branch protection checks
+ test-check:
+ if: always()
+ needs:
+ - required
+ - minimal
+ - os-check
+ - coverage
+ runs-on: ubuntu-latest
+ permissions: {}
+ steps:
+ - name: Decide whether the needed jobs succeeded or failed
+ uses: re-actors/alls-green@release/v1
+ with:
+ jobs: ${{ toJSON(needs) }}
diff --git a/Cargo.lock b/Cargo.lock
index d298e3a..b3f6b3b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -690,7 +690,6 @@ dependencies = [
"tracing",
"tracing-subscriber",
"webcrawler",
- "workspace-hack",
]
[[package]]
@@ -1248,7 +1247,6 @@ dependencies = [
"itertools",
"minidom-14",
"pretty_assertions",
- "workspace-hack",
]
[[package]]
@@ -1363,7 +1361,6 @@ dependencies = [
"tracing",
"ulid",
"webcrawler",
- "workspace-hack",
]
[[package]]
@@ -1573,7 +1570,6 @@ dependencies = [
"tracing",
"tracing-forest",
"tracing-subscriber",
- "workspace-hack",
]
[[package]]
@@ -2175,7 +2171,6 @@ dependencies = [
"tracing",
"tracing-forest",
"tracing-subscriber",
- "workspace-hack",
]
[[package]]
@@ -2294,7 +2289,6 @@ dependencies = [
"serde",
"serde_yaml",
"tracing",
- "workspace-hack",
]
[[package]]
@@ -2367,7 +2361,6 @@ dependencies = [
"serde-aux",
"serde_json",
"serde_with",
- "workspace-hack",
]
[[package]]
@@ -2391,7 +2384,6 @@ dependencies = [
"swegov-opendata",
"tracing",
"tracing-subscriber",
- "workspace-hack",
"zip",
]
@@ -3145,25 +3137,6 @@ dependencies = [
"windows-sys 0.48.0",
]
-[[package]]
-name = "workspace-hack"
-version = "0.1.0"
-dependencies = [
- "futures-core",
- "memchr",
- "proc-macro2 1.0.86",
- "quick-xml",
- "quote 1.0.37",
- "rand_core 0.6.4",
- "regex",
- "regex-automata 0.4.7",
- "regex-syntax 0.8.4",
- "serde",
- "smallvec",
- "syn 2.0.77",
- "tracing-core",
-]
-
[[package]]
name = "yaml-rust"
version = "0.4.5"
diff --git a/Cargo.toml b/Cargo.toml
index 3f03cd3..b3bc822 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,22 +10,29 @@ members = [
"crates/sparv-extension",
"crates/swegov-opendata",
"crates/swegov-opendata-preprocess",
- "crates/workspace-hack",
]
[workspace.package]
-rust-version = "1.64"
+rust-version = "1.74"
edition = "2021"
license = "MIT OR Apache-2.0"
-authors = ["Kristoffer Andersson "]
+authors = [
+ "Språkbanken Text ",
+ "Kristoffer Andersson ",
+]
[workspace.dependencies]
# local crates
+minidom-extension = { path = "./crates/minidom-extension"}
+opendata-spiders = { path = "./crates/opendata-spiders"}
preprocessors = { path = "./crates/preprocessors" }
+sparv-extension = { path = "./crates/sparv-extension"}
+swegov-opendata = { path = "./crates/swegov-opendata"}
+swegov-opendata-preprocess = { path = "./crates/swegov-opendata-preprocess"}
# external crates
-anyhow = "1.0.71"
-async-trait = "0.1.68"
-chrono = "0.4.31"
+anyhow = "1.0.73"
+async-trait = "0.1.82"
+chrono = "0.4.37"
clap = { version = "4.4.11" }
config = "0.13.3"
deserx = { git = "https://github.com/kod-kristoff/deserx" }
@@ -50,23 +57,20 @@ regex-automata = { version = "0.4", default-features = false }
regex-syntax = "0.8"
reqwest = { version = "0.11.17", default-features = false }
rstest = "0.18.2"
-serde = { version = "1.0.160" }
+serde = { version = "1.0.210" }
serde-aux = "4.2.0"
-serde_json = "1.0.107"
+serde_json = "1.0.128"
serde_with = "3.0.0"
serde_yaml = "0.9.27"
smallvec = { version = "1", default-features = false }
soup = "0.5.1"
syn = "2"
-tokio = "1.28.0"
+tokio = "1.40.0"
tokio-stream = "0.1.14"
-tracing = "0.1.37"
+tracing = "0.1.40"
tracing-core = "0.1"
tracing-forest = "0.1.6"
tracing-subscriber = { version = "0.3.17", features = ["env-filter", "json"] }
ulid = "1.0.0"
webcrawler = { git = "https://github.com/spraakbanken/webcrawler-rs" }
zip = "0.6.6"
-
-[workspace.metadata.cargo-machete]
-ignored = ["workspace-hack"]
diff --git a/Justfile b/Justfile
index c5f79ab..baada3b 100644
--- a/Justfile
+++ b/Justfile
@@ -1,8 +1,8 @@
opendata-quick-dev:
- cargo watch -q -c -w swegov-opendata -x 'run -p swegov-opendata --example quick_dev'
+ cargo watch -q -c -w swegov-opendata -x 'run -p swegov-opendata --example quick_dev_swegov_opendata'
spiders-quick-dev:
- cargo watch -q -c -w opendata-spiders -x 'run -p opendata-spiders --example quick_dev'
+ cargo watch -q -c -w opendata-spiders -x 'run -p opendata-spiders --example quick_dev_opendata_spiders'
quick-dev:
cargo watch -q -c -w src -w sfs-corpus-core -x 'run -- generate xml data/sfs/output/sfs'
diff --git a/README.md b/README.md
index 1f6e364..e1e205a 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,26 @@
# swegov-opendata-rs
-Tool used for collecting SFS (Svensk Författningssamling) from [Riksdagens öppna data](https://data.riksdagen.se).
+Tools used for collecting SFS (Svensk Författningssamling) from [Riksdagens öppna data](https://data.riksdagen.se).
+
+[![MIT licensed][mit-badge]][mit-url]
+
+[![Maturity badge - level 1][scorecard-badge]][scorecard-url]
+
+[![CI(check)][actions-check-badge]][actions-check-url]
+[![CI(scheduled)][actions-scheduled-badge]][actions-scheduled-url]
+[![CI(test)][actions-test-badge]][actions-test-url]
+
+[mit-badge]: https://img.shields.io/badge/license-MIT-blue.svg
+[mit-url]: LICENSE
+[actions-check-badge]: https://github.com/spraakbanken/swegov-opendata-rs/actions/workflows/check.yml/badge.svg
+[actions-check-url]: https://github.com/spraakbanken/swegov-opendata-rs/actions?query=workflow%3Acheck+branch%3Amain
+[actions-scheduled-badge]: https://github.com/spraakbanken/swegov-opendata-rs/actions/workflows/scheduled.yml/badge.svg
+[actions-scheduled-url]: https://github.com/spraakbanken/swegov-opendata-rs/actions?query=workflow%3Arolling+branch%3Amain
+[actions-test-badge]: https://github.com/spraakbanken/swegov-opendata-rs/actions/workflows/test.yml/badge.svg
+[actions-test-url]: https://github.com/spraakbanken/swegov-opendata-rs/actions?query=workflow%3Atest+branch%3Amain
+[scorecard-badge]: https://img.shields.io/badge/Maturity-Level%201%20--%20New%20Project-yellow.svg
+[scorecard-url]: https://github.com/spraakbanken/getting-started/blob/main/scorecard.md
-This workspace contains the binary `fetch-sfs` in the root.
## fetch-sfs
@@ -60,6 +78,20 @@ The spiders work in 2 steps,
-# References
+## References
- [Riksdagens öppna data dokumentation](https://data.riksdagen.se/dokumentation/)
+
+## MSRV Policy
+
+The MSRV (Minimum Supported Rust Version) is fixed for a given minor (1.x)
+version. However it can be increased when bumping minor versions, i.e. going
+from 1.0 to 1.1 allows us to increase the MSRV. Users unable to increase their
+Rust version can use an older minor version instead. Below is a list of swegov-opendata-rs versions
+and their MSRV:
+
+ * v0.1: Rust 1.74.
+
+Note however that swegov-opendata-rs also has dependencies, which might have different MSRV
+policies. We try to stick to the above policy when updating dependencies, but
+this is not always possible.
diff --git a/crates/fetch-sfs/Cargo.toml b/crates/fetch-sfs/Cargo.toml
index 0624cbc..2c51294 100644
--- a/crates/fetch-sfs/Cargo.toml
+++ b/crates/fetch-sfs/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "fetch-sfs"
version = "0.1.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -15,14 +18,12 @@ doctest = false
anyhow = { workspace = true }
clap = { workspace = true, features = ["derive", "cargo"] }
config = { workspace = true }
-# error-stack = { workspace = true }
serde = { workspace = true, features = ["derive"] }
tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs"] }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
webcrawler = { workspace = true }
# local deps
-opendata-spiders = { version = "0.1.0", path = "../opendata-spiders" }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
+opendata-spiders = { workspace = true }
preprocessors.workspace = true
diff --git a/crates/fetch-sfs/src/configuration.rs b/crates/fetch-sfs/src/configuration.rs
deleted file mode 100644
index 95dc569..0000000
--- a/crates/fetch-sfs/src/configuration.rs
+++ /dev/null
@@ -1,18 +0,0 @@
-use opendata_spiders::sfs::SfsSpiderOptions;
-
-#[derive(serde::Deserialize, Clone)]
-pub struct Settings {
- pub sfs: SfsSpiderOptions,
-}
-
-pub fn get_configuration() -> Result {
- let base_path =
- std::env::current_dir().expect("configuration: Failed to determine the current directory");
-
- let settings = config::Config::builder()
- .set_default("sfs.output_path", "./output")?
- .add_source(config::File::from(base_path.join("config.json")).required(false))
- .build()?;
-
- settings.try_deserialize::()
-}
diff --git a/crates/fetch-sfs/src/main.rs b/crates/fetch-sfs/src/main.rs
index 75392c7..1406647 100644
--- a/crates/fetch-sfs/src/main.rs
+++ b/crates/fetch-sfs/src/main.rs
@@ -1,4 +1,3 @@
-mod configuration;
mod options;
use std::{io, path::PathBuf, sync::Arc, time::Duration};
@@ -10,9 +9,6 @@ use webcrawler::{crawler, CrawlerOptions};
use crate::options::Args;
-// pub use crate::error::Error;
-const PROCESSED_STATE: &str = "visited.json";
-
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let args = Args::parse();
@@ -25,8 +21,6 @@ async fn main() -> anyhow::Result<()> {
init_tracing()?;
- let config = configuration::get_configuration()?;
-
let spider = Arc::new(opendata_spiders::sfs::SfsSpider::new(
opendata_spiders::sfs::SfsSpiderOptions {
user_agent: Some(APP_USER_AGENT.into()),
diff --git a/crates/minidom-extension/Cargo.toml b/crates/minidom-extension/Cargo.toml
index 1d2fc9d..021a313 100644
--- a/crates/minidom-extension/Cargo.toml
+++ b/crates/minidom-extension/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "minidom-extension"
version = "0.1.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
@@ -11,7 +14,6 @@ doctest = false
minidom-14 = { workspace = true }
pretty_assertions = { workspace = true }
itertools = { workspace = true }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
[package.metadata.cargo-machete]
ignored = ["minidom-14"]
diff --git a/crates/opendata-spiders/Cargo.toml b/crates/opendata-spiders/Cargo.toml
index b0b3bc2..45787a8 100644
--- a/crates/opendata-spiders/Cargo.toml
+++ b/crates/opendata-spiders/Cargo.toml
@@ -1,14 +1,16 @@
[package]
name = "opendata-spiders"
version = "0.1.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
doctest = false
[dependencies]
-# anyhow = "1.0.71"
anyhow = { workspace = true }
async-trait = { workspace = true }
deserx = { workspace = true }
@@ -23,9 +25,8 @@ reqwest = { workspace = true, features = [
] }
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
-swegov-opendata = { version = "0.2.0", path = "../swegov-opendata" }
+swegov-opendata = { workspace = true }
tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs"] }
tracing = { workspace = true }
ulid = { workspace = true }
webcrawler = { workspace = true }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
diff --git a/crates/opendata-spiders/examples/quick_dev.rs b/crates/opendata-spiders/examples/quick_dev_opendata_spiders.rs
similarity index 100%
rename from crates/opendata-spiders/examples/quick_dev.rs
rename to crates/opendata-spiders/examples/quick_dev_opendata_spiders.rs
diff --git a/crates/preprocessors/Cargo.toml b/crates/preprocessors/Cargo.toml
index 4a1d1af..498759d 100644
--- a/crates/preprocessors/Cargo.toml
+++ b/crates/preprocessors/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "preprocessors"
version = "0.1.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[[bin]]
@@ -20,8 +23,7 @@ prodash = { workspace = true, features = [
"render-line",
"render-line-crossterm",
] }
-swegov-opendata-preprocess = { version = "0.1.0", path = "../swegov-opendata-preprocess" }
+swegov-opendata-preprocess = { workspace = true}
tracing = { workspace = true }
tracing-forest = { workspace = true }
tracing-subscriber = { workspace = true }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
diff --git a/crates/sfs-preprocess/Cargo.toml b/crates/sfs-preprocess/Cargo.toml
index a0fb461..dd2ac2d 100644
--- a/crates/sfs-preprocess/Cargo.toml
+++ b/crates/sfs-preprocess/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "sfs-preprocess"
version = "0.1.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[[bin]]
@@ -21,6 +24,5 @@ tracing-forest = { workspace = true }
tracing-subscriber = { workspace = true }
# local deps
-preprocessors.workspace = true
-swegov-opendata-preprocess = { version = "0.1.0", path = "../swegov-opendata-preprocess" }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
+preprocessors = {workspace = true}
+swegov-opendata-preprocess = { workspace = true }
diff --git a/crates/sparv-extension/Cargo.toml b/crates/sparv-extension/Cargo.toml
index 69dcc5f..160d4e2 100644
--- a/crates/sparv-extension/Cargo.toml
+++ b/crates/sparv-extension/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "sparv-extension"
version = "0.1.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
@@ -12,4 +15,3 @@ error-stack = { workspace = true }
serde_yaml = { workspace = true }
serde = { workspace = true, features = ["derive"] }
tracing = { workspace = true }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
diff --git a/crates/swegov-opendata-preprocess/Cargo.toml b/crates/swegov-opendata-preprocess/Cargo.toml
index b70f3ed..4b5429d 100644
--- a/crates/swegov-opendata-preprocess/Cargo.toml
+++ b/crates/swegov-opendata-preprocess/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "swegov-opendata-preprocess"
version = "0.1.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
@@ -16,19 +19,18 @@ soup = { workspace = true }
zip = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
-swegov-opendata = { version = "0.2.0", path = "../swegov-opendata" }
+swegov-opendata = { workspace = true }
chrono = { workspace = true }
flate2 = { workspace = true }
pretty_assertions = { workspace = true }
once_cell = { workspace = true }
itertools = { workspace = true }
-minidom-extension = { version = "0.1.0", path = "../minidom-extension" }
-sparv-extension = { version = "0.1.0", path = "../sparv-extension" }
+minidom-extension = { workspace = true }
+sparv-extension = { workspace = true }
prodash = { workspace = true, features = [
"render-line",
"render-line-crossterm",
] }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
itertools = { workspace = true }
diff --git a/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs b/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs
index 703a9b2..09cd380 100644
--- a/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs
+++ b/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs
@@ -208,7 +208,7 @@ fn process_node(node: &rcdom::Handle) -> ProcessNodeOutput {
for child in node.children() {
let ProcessNodeOutput {
nodes: child_nodes,
- attrs,
+ attrs: _,
} = process_node(&child);
for child_node in child_nodes {
match child_node {
diff --git a/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs b/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs
index 24e40c8..bfc3811 100644
--- a/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs
+++ b/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs
@@ -37,10 +37,8 @@ pub fn preprocess_xml(
let mut in_dokument = false;
let mut in_html = false;
let mut found_html = false;
- let mut collect_doc_attr = false;
let mut doc_attr = None;
let mut text_attr = None;
- let mut collect_text_attr = false;
let mut reader = Reader::from_str(xml_string);
loop {
match reader.read_event() {
@@ -60,7 +58,6 @@ pub fn preprocess_xml(
| b"dokumentstatus_url_xml"
| b"dokument_url_text"
| b"dokument_url_html" => {
- collect_doc_attr = true;
doc_attr = Some(String::from_utf8(e.name().as_ref().to_vec()).unwrap());
tracing::trace!("found doc attr '{:?}'", doc_attr);
diff --git a/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs b/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs
index 0a0672d..23efe46 100644
--- a/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs
+++ b/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs
@@ -1,5 +1,3 @@
-use std::borrow::Cow;
-
use error_stack::Report;
use minidom::{
quick_xml::{events::Event, Reader},
@@ -227,7 +225,7 @@ pub fn extract_page(
_ => todo!("handle End {:?} state={:?}", e, state),
}
}
- Ok(Event::Comment(e)) => {
+ Ok(Event::Comment(_e)) => {
continue;
}
Ok(e) => todo!("handle {:?} state={:?}", e, state),
diff --git a/crates/swegov-opendata-preprocess/src/shared.rs b/crates/swegov-opendata-preprocess/src/shared.rs
index 6f1dc74..f482dfc 100644
--- a/crates/swegov-opendata-preprocess/src/shared.rs
+++ b/crates/swegov-opendata-preprocess/src/shared.rs
@@ -1,5 +1,5 @@
use itertools::Itertools;
-use minidom::{Element, ElementBuilder, Node};
+use minidom::{Element, Node};
use minidom_extension::{elem_is_empty, minidom};
pub fn clean_element(elem: &Element) -> Element {
@@ -23,7 +23,7 @@ pub fn clean_element(elem: &Element) -> Element {
new_elem
}
-pub fn clean_texts(elem: &mut Element) {}
+pub fn clean_texts(_elem: &mut Element) {}
fn clean_nodes(new_elem: &mut Element, elem: &Element) {
for node in elem.nodes() {
@@ -56,11 +56,12 @@ fn clean_nodes(new_elem: &mut Element, elem: &Element) {
pub fn clean_text(text: &str) -> String {
let text = text.replace('\u{AD}', "");
- text.split_whitespace()
- // .split(char::is_whitespace)
- .intersperse(" ")
- // .filter(|part| !part.trim().is_empty())
- .collect()
+ Itertools::intersperse(text.split_whitespace(), " ").collect()
+ // text.split_whitespace()
+ // // .split(char::is_whitespace)
+ // .intersperse(" ")
+ // // .filter(|part| !part.trim().is_empty())
+ // .collect()
}
#[cfg(test)]
diff --git a/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs b/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs
index 592a83c..7902430 100644
--- a/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs
+++ b/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs
@@ -35,7 +35,7 @@ fn test_preprocess_sfs_json() -> PreprocessResult<()> {
// Assert
let mut reader = Reader::from_reader(actual.as_slice());
- let actual = Element::from_reader(&mut reader)
+ let _actual = Element::from_reader(&mut reader)
.change_context(PreprocessError)
.attach_printable("failed to read actual")?;
@@ -45,7 +45,7 @@ fn test_preprocess_sfs_json() -> PreprocessResult<()> {
.attach_printable_lazy(|| example1_expected_path.to_string())?;
let reader = BufReader::new(example1_expected_file);
let mut reader = Reader::from_reader(reader);
- let expected = Element::from_reader(&mut reader)
+ let _expected = Element::from_reader(&mut reader)
.change_context(PreprocessError)
.attach_printable("failed to read expected")?;
diff --git a/crates/swegov-opendata/Cargo.toml b/crates/swegov-opendata/Cargo.toml
index 2a30757..6137d1d 100644
--- a/crates/swegov-opendata/Cargo.toml
+++ b/crates/swegov-opendata/Cargo.toml
@@ -1,7 +1,10 @@
[package]
name = "swegov-opendata"
version = "0.2.0"
-edition = "2021"
+edition = { workspace = true }
+authors = { workspace = true }
+rust-version = { workspace = true }
+license = { workspace = true }
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
@@ -16,7 +19,6 @@ serde = { workspace = true, features = ["derive"] }
serde-aux = { workspace = true }
serde_json = { workspace = true }
serde_with = { workspace = true }
-workspace-hack = { version = "0.1", path = "../workspace-hack" }
[dev-dependencies]
quick-xml = { workspace = true, features = ["serialize"] }
diff --git a/crates/swegov-opendata/examples/quick_dev.rs b/crates/swegov-opendata/examples/quick_dev_swegov_opendata.rs
similarity index 100%
rename from crates/swegov-opendata/examples/quick_dev.rs
rename to crates/swegov-opendata/examples/quick_dev_swegov_opendata.rs
diff --git a/crates/workspace-hack/.gitattributes b/crates/workspace-hack/.gitattributes
deleted file mode 100644
index 3e9dba4..0000000
--- a/crates/workspace-hack/.gitattributes
+++ /dev/null
@@ -1,4 +0,0 @@
-# Avoid putting conflict markers in the generated Cargo.toml file, since their presence breaks
-# Cargo.
-# Also do not check out the file as CRLF on Windows, as that's what hakari needs.
-Cargo.toml merge=binary -crlf
diff --git a/crates/workspace-hack/Cargo.toml b/crates/workspace-hack/Cargo.toml
deleted file mode 100644
index 33a460c..0000000
--- a/crates/workspace-hack/Cargo.toml
+++ /dev/null
@@ -1,38 +0,0 @@
-# This file is generated by `cargo hakari`.
-# To regenerate, run:
-# cargo hakari generate
-
-[package]
-name = "workspace-hack"
-version = "0.1.0"
-description = "workspace-hack package, managed by hakari"
-# You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing.
-publish = false
-
-# The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments
-# are managed by hakari.
-
-### BEGIN HAKARI SECTION
-[dependencies]
-futures-core = { workspace = true }
-memchr = { workspace = true }
-quick-xml = { workspace = true, features = ["serialize"] }
-rand_core = { workspace = true, features = ["std"] }
-regex = { workspace = true }
-regex-automata = { workspace = true, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
-regex-syntax = { workspace = true }
-serde = { workspace = true, features = ["alloc", "derive"] }
-smallvec = { workspace = true, features = ["write"] }
-tracing-core = { workspace = true }
-
-[build-dependencies]
-memchr = { workspace = true }
-proc-macro2 = { workspace = true }
-quote = { workspace = true }
-regex = { workspace = true }
-regex-automata = { workspace = true, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] }
-regex-syntax = { workspace = true }
-serde = { workspace = true, features = ["alloc", "derive"] }
-syn = { workspace = true, features = ["extra-traits", "full", "visit", "visit-mut"] }
-
-### END HAKARI SECTION
diff --git a/crates/workspace-hack/build.rs b/crates/workspace-hack/build.rs
deleted file mode 100644
index 92518ef..0000000
--- a/crates/workspace-hack/build.rs
+++ /dev/null
@@ -1,2 +0,0 @@
-// A build script is required for cargo to consider build dependencies.
-fn main() {}
diff --git a/crates/workspace-hack/src/lib.rs b/crates/workspace-hack/src/lib.rs
deleted file mode 100644
index 22489f6..0000000
--- a/crates/workspace-hack/src/lib.rs
+++ /dev/null
@@ -1 +0,0 @@
-// This is a stub lib.rs.