diff --git a/.github/DOCS.md b/.github/DOCS.md new file mode 100644 index 0000000..962c7ae --- /dev/null +++ b/.github/DOCS.md @@ -0,0 +1,33 @@ +# Github config and workflows + +Copied from . + +In this folder there is configuration for codecoverage, dependabot, and ci +workflows that check the library more deeply than the default configurations. + +This folder can be or was merged using a --allow-unrelated-histories merge +strategy from which provides a +reasonably sensible base for writing your own ci on. By using this strategy +the history of the CI repo is included in your repo, and future updates to +the CI can be merged later. + +To perform this merge run: + +```shell +git remote add ci https://github.com/kod-kristoff/rust-ci-conf.git +git fetch ci +git merge --allow-unrelated-histories ci/main +``` + +or + +```shell +git remote add ci git@github.com:kod-kristoff/rust-ci-conf.git +git fetch ci +git merge --allow-unrelated-histories ci/main +``` + +An overview of the files in this project is available at: +, which contains some +rationale for decisions and runs through an example of solving minimal version +and OpenSSL issues. 
diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..cd5ce8f --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,21 @@ +# ref: https://docs.codecov.com/docs/codecovyml-reference +coverage: + # Hold ourselves to a high bar + range: 85..100 + round: down + precision: 1 + status: + # ref: https://docs.codecov.com/docs/commit-status + project: + default: + # Avoid false negatives + threshold: 1% + +# Test files aren't important for coverage +ignore: + - "tests" + +# Make comments less noisy +comment: + layout: "files" + require_changes: true diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..d0f091e --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,19 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + schedule: + interval: daily + - package-ecosystem: cargo + directory: / + schedule: + interval: daily + ignore: + - dependency-name: "*" + # patch and minor updates don't matter for libraries as consumers of this library build + # with their own lockfile, rather than the version specified in this library's lockfile + # remove this ignore rule if your package has binaries to ensure that the binaries are + # built with the exact set of dependencies and those are up to date. + update-types: + - "version-update:semver-patch" + - "version-update:semver-minor" diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml new file mode 100644 index 0000000..f3b6061 --- /dev/null +++ b/.github/workflows/check.yml @@ -0,0 +1,130 @@ +# This workflow runs whenever a PR is opened or updated, or a commit is pushed to main. 
It runs +# several checks: +# - fmt: checks that the code is formatted according to rustfmt +# - clippy: checks that the code does not contain any clippy warnings +# - doc: checks that the code can be documented without errors +# - hack: check combinations of feature flags +# - msrv: check that the msrv specified in the crate is correct +permissions: + contents: read +# This configuration allows maintainers of this repo to create a branch and pull request based on +# the new branch. Restricting the push trigger to the main branch ensures that the PR only gets +# built once. +on: + push: + branches: [main] + pull_request: + merge_group: +# If new code is pushed to a PR branch, then cancel in progress workflows for that PR. Ensures that +# we don't waste CI time, and returns results quicker https://github.com/jonhoo/rust-ci-conf/pull/5 +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true +name: check +jobs: + fmt: + runs-on: ubuntu-latest + name: stable / fmt + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install stable + uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + - name: cargo fmt --check + run: cargo fmt --check + clippy: + runs-on: ubuntu-latest + name: ${{ matrix.toolchain }} / clippy + permissions: + contents: read + checks: write + strategy: + fail-fast: false + matrix: + # Get early warning of new lints which are regularly introduced in beta channels. + toolchain: [stable, beta] + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install ${{ matrix.toolchain }} + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.toolchain }} + components: clippy + - name: cargo clippy + uses: auguwu/clippy-action@1.4.0 + with: + token: ${{secrets.GITHUB_TOKEN}} + doc: + # run docs generation on nightly rather than stable. 
This enables features like + # https://doc.rust-lang.org/beta/unstable-book/language-features/doc-cfg.html which allows an + # API to be documented as only available on some specific platforms. + runs-on: ubuntu-latest + name: nightly / doc + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install nightly + uses: dtolnay/rust-toolchain@nightly + - name: cargo doc + run: cargo doc --no-deps --all-features + env: + RUSTDOCFLAGS: --cfg docsrs + hack: + # cargo-hack checks combinations of feature flags to ensure that features are all additive + # which is required for feature unification + runs-on: ubuntu-latest + name: ubuntu / stable / features + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install stable + uses: dtolnay/rust-toolchain@stable + - name: cargo install cargo-hack + uses: taiki-e/install-action@cargo-hack + # intentionally no target specifier; see https://github.com/jonhoo/rust-ci-conf/pull/4 + # --feature-powerset runs for every combination of features + - name: cargo hack + run: cargo hack --feature-powerset check + msrv: + # check that we can build using the minimal rust version that is specified by this crate + runs-on: ubuntu-latest + # we use a matrix here just because env can't be used in job names + # https://docs.github.com/en/actions/learn-github-actions/contexts#context-availability + strategy: + matrix: + msrv: ["1.74.0"] # 2021 edition requires 1.56 + name: ubuntu / ${{ matrix.msrv }} + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install ${{ matrix.msrv }} + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.msrv }} + - name: cargo +${{ matrix.msrv }} check + run: cargo check + + # https://github.com/marketplace/actions/alls-green#why used for branch protection checks + check-check: + if: always() + needs: + - fmt + - clippy + - doc + - hack + - msrv + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Decide whether the needed jobs 
succeeded or failed + uses: re-actors/alls-green@release/v1 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml new file mode 100644 index 0000000..82ec91b --- /dev/null +++ b/.github/workflows/scheduled.yml @@ -0,0 +1,72 @@ +# Run scheduled (rolling) jobs on a nightly basis, as your crate may break independently of any +# given PR. E.g., updates to rust nightly and updates to this crate's dependencies. See check.yml for +# information about how the concurrency cancellation and workflow triggering works +permissions: + contents: read +on: + push: + branches: [main] + pull_request: + schedule: + - cron: "7 7 * * *" +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true +name: rolling +jobs: + # https://twitter.com/mycoliza/status/1571295690063753218 + nightly: + runs-on: ubuntu-latest + name: ubuntu / nightly + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install nightly + uses: dtolnay/rust-toolchain@nightly + - name: cargo generate-lockfile + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + - name: cargo test --locked + run: cargo test --locked --all-features --all-targets + # https://twitter.com/alcuadrado/status/1571291687837732873 + update: + # This action checks that updating the dependencies of this crate to the latest available that + # satisfy the versions in Cargo.toml does not break this crate. This is important as consumers + # of this crate will generally use the latest available crates. This is subject to the standard + # Cargo semver rules (i.e., cargo does not update to a new major version unless explicitly told + # to). + runs-on: ubuntu-latest + name: ubuntu / beta / updated + # There's no point running this if no Cargo.lock was checked in in the first place, since we'd + # just redo what happened in the regular test job. 
Unfortunately, hashFiles only works in if on + # steps, so we repeat it. + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install beta + if: hashFiles('Cargo.lock') != '' + uses: dtolnay/rust-toolchain@beta + - name: cargo update + if: hashFiles('Cargo.lock') != '' + run: cargo update + - name: cargo test + if: hashFiles('Cargo.lock') != '' + run: cargo test --locked --all-features --all-targets + env: + RUSTFLAGS: -D deprecated + + # https://github.com/marketplace/actions/alls-green#why used for branch protection checks + scheduled-check: + if: always() + needs: + - nightly + - update + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Decide whether the needed jobs succeeded or failed + uses: re-actors/alls-green@release/v1 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..abb0b91 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,172 @@ +# This is the main CI workflow that runs the test suite on all pushes to main and all pull requests. 
+# It runs the following jobs: +# - required: runs the test suite on ubuntu with stable and beta rust toolchains +# - minimal: runs the test suite with the minimal versions of the dependencies that satisfy the +# requirements of this crate, and its dependencies +# - os-check: runs the test suite on mac and windows +# - coverage: runs the test suite and collects coverage information +# See check.yml for information about how the concurrency cancellation and workflow triggering works +permissions: + contents: read +on: + push: + branches: [main] + pull_request: +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true +name: test +jobs: + required: + runs-on: ubuntu-latest + name: ubuntu / ${{ matrix.toolchain }} + strategy: + matrix: + # run on stable and beta to ensure that tests won't break on the next version of the rust + # toolchain + toolchain: [stable, beta] + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install ${{ matrix.toolchain }} + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.toolchain }} + - name: cargo generate-lockfile + # enable this ci template to run regardless of whether the lockfile is checked in or not + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + # https://twitter.com/jonhoo/status/1571290371124260865 + - name: cargo test --locked + run: cargo test --locked --all-features --all-targets + # https://github.com/rust-lang/cargo/issues/6669 + - name: cargo test --doc + run: cargo test --locked --all-features --doc + minimal: + # This action chooses the oldest version of the dependencies permitted by Cargo.toml to ensure + # that this crate is compatible with the minimal version that this crate and its dependencies + # require. 
This will pick up issues where this crate relies on functionality that was introduced + # later than the actual version specified (e.g., when we choose just a major version, but a + # method was added after this version). + # + # This particular check can be difficult to get to succeed as often transitive dependencies may + # be incorrectly specified (e.g., a dependency specifies 1.0 but really requires 1.1.5). There + # is an alternative flag available -Zdirect-minimal-versions that uses the minimal versions for + # direct dependencies of this crate, while selecting the maximal versions for the transitive + # dependencies. Alternatively, you can add a line in your Cargo.toml to artificially increase + # the minimal dependency, which you do with e.g.: + # ```toml + # # for minimal-versions + # [target.'cfg(any())'.dependencies] + # openssl = { version = "0.10.55", optional = true } # needed to allow foo to build with -Zminimal-versions + # ``` + # The optional = true is necessary in case that dependency isn't otherwise transitively required + # by your library, and the target bit is so that this dependency edge never actually affects + # Cargo build order. See also + # https://github.com/jonhoo/fantoccini/blob/fde336472b712bc7ebf5b4e772023a7ba71b2262/Cargo.toml#L47-L49. 
+ # This action is run on ubuntu with the stable toolchain, as it is not expected to fail + runs-on: ubuntu-latest + name: ubuntu / stable / minimal-versions + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install stable + uses: dtolnay/rust-toolchain@stable + - name: Install nightly for -Zminimal-versions + uses: dtolnay/rust-toolchain@nightly + - name: rustup default stable + run: rustup default stable + - name: cargo update -Zminimal-versions + run: cargo +nightly update -Zdirect-minimal-versions + - name: cargo test + run: cargo test --locked --all-features --all-targets + os-check: + # run cargo test on mac and windows + runs-on: ${{ matrix.os }} + name: ${{ matrix.os }} / stable + strategy: + fail-fast: false + matrix: + os: [macos-latest, windows-latest] + steps: + # if your project needs OpenSSL, uncomment this to fix Windows builds. + # it's commented out by default as the install command takes 5-10m. + # - run: echo "VCPKG_ROOT=$env:VCPKG_INSTALLATION_ROOT" | Out-File -FilePath $env:GITHUB_ENV -Append + # if: runner.os == 'Windows' + # - run: vcpkg install openssl:x64-windows-static-md + # if: runner.os == 'Windows' + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install stable + uses: dtolnay/rust-toolchain@stable + - name: cargo generate-lockfile + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + - name: cargo test + run: cargo test --locked --all-features --all-targets + coverage: + # use llvm-cov to build and collect coverage and outputs in a format that + # is compatible with codecov.io + # + # note that codecov as of v4 requires that CODECOV_TOKEN from + # + # https://app.codecov.io/gh///settings + # + # is set in two places on your repo: + # + # - https://github.com/jonhoo/guardian/settings/secrets/actions + # - https://github.com/jonhoo/guardian/settings/secrets/dependabot + # + # (the latter is needed for codecov uploads to work with Dependabot PRs) + # + # PRs coming from forks of 
your repo will not have access to the token, but + # for those, codecov allows uploading coverage reports without a token. + # it's all a little weird and inconvenient. see + # + # https://github.com/codecov/feedback/issues/112 + # + # for lots of more discussion + runs-on: ubuntu-latest + name: ubuntu / stable / coverage + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - name: Install stable + uses: dtolnay/rust-toolchain@stable + with: + components: llvm-tools-preview + - name: cargo install cargo-llvm-cov + uses: taiki-e/install-action@cargo-llvm-cov + - name: cargo generate-lockfile + if: hashFiles('Cargo.lock') == '' + run: cargo generate-lockfile + - name: cargo llvm-cov + run: cargo llvm-cov --locked --all-features --lcov --output-path lcov.info + - name: Record Rust version + run: echo "RUST=$(rustc --version)" >> "$GITHUB_ENV" + - name: Upload to codecov.io + uses: codecov/codecov-action@v4 + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} + env_vars: OS,RUST + + # https://github.com/marketplace/actions/alls-green#why used for branch protection checks + test-check: + if: always() + needs: + - required + - minimal + - os-check + - coverage + runs-on: ubuntu-latest + permissions: {} + steps: + - name: Decide whether the needed jobs succeeded or failed + uses: re-actors/alls-green@release/v1 + with: + jobs: ${{ toJSON(needs) }} diff --git a/Cargo.lock b/Cargo.lock index d298e3a..b3f6b3b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -690,7 +690,6 @@ dependencies = [ "tracing", "tracing-subscriber", "webcrawler", - "workspace-hack", ] [[package]] @@ -1248,7 +1247,6 @@ dependencies = [ "itertools", "minidom-14", "pretty_assertions", - "workspace-hack", ] [[package]] @@ -1363,7 +1361,6 @@ dependencies = [ "tracing", "ulid", "webcrawler", - "workspace-hack", ] [[package]] @@ -1573,7 +1570,6 @@ dependencies = [ "tracing", "tracing-forest", "tracing-subscriber", - "workspace-hack", ] [[package]] @@ -2175,7 +2171,6 @@ 
dependencies = [ "tracing", "tracing-forest", "tracing-subscriber", - "workspace-hack", ] [[package]] @@ -2294,7 +2289,6 @@ dependencies = [ "serde", "serde_yaml", "tracing", - "workspace-hack", ] [[package]] @@ -2367,7 +2361,6 @@ dependencies = [ "serde-aux", "serde_json", "serde_with", - "workspace-hack", ] [[package]] @@ -2391,7 +2384,6 @@ dependencies = [ "swegov-opendata", "tracing", "tracing-subscriber", - "workspace-hack", "zip", ] @@ -3145,25 +3137,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "workspace-hack" -version = "0.1.0" -dependencies = [ - "futures-core", - "memchr", - "proc-macro2 1.0.86", - "quick-xml", - "quote 1.0.37", - "rand_core 0.6.4", - "regex", - "regex-automata 0.4.7", - "regex-syntax 0.8.4", - "serde", - "smallvec", - "syn 2.0.77", - "tracing-core", -] - [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/Cargo.toml b/Cargo.toml index 3f03cd3..b3bc822 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,22 +10,29 @@ members = [ "crates/sparv-extension", "crates/swegov-opendata", "crates/swegov-opendata-preprocess", - "crates/workspace-hack", ] [workspace.package] -rust-version = "1.64" +rust-version = "1.74" edition = "2021" license = "MIT OR Apache-2.0" -authors = ["Kristoffer Andersson "] +authors = [ + "Språkbanken Text ", + "Kristoffer Andersson ", +] [workspace.dependencies] # local crates +minidom-extension = { path = "./crates/minidom-extension"} +opendata-spiders = { path = "./crates/opendata-spiders"} preprocessors = { path = "./crates/preprocessors" } +sparv-extension = { path = "./crates/sparv-extension"} +swegov-opendata = { path = "./crates/swegov-opendata"} +swegov-opendata-preprocess = { path = "./crates/swegov-opendata-preprocess"} # external crates -anyhow = "1.0.71" -async-trait = "0.1.68" -chrono = "0.4.31" +anyhow = "1.0.73" +async-trait = "0.1.82" +chrono = "0.4.37" clap = { version = "4.4.11" } config = "0.13.3" deserx = { git = "https://github.com/kod-kristoff/deserx" } @@ -50,23 +57,20 
@@ regex-automata = { version = "0.4", default-features = false } regex-syntax = "0.8" reqwest = { version = "0.11.17", default-features = false } rstest = "0.18.2" -serde = { version = "1.0.160" } +serde = { version = "1.0.210" } serde-aux = "4.2.0" -serde_json = "1.0.107" +serde_json = "1.0.128" serde_with = "3.0.0" serde_yaml = "0.9.27" smallvec = { version = "1", default-features = false } soup = "0.5.1" syn = "2" -tokio = "1.28.0" +tokio = "1.40.0" tokio-stream = "0.1.14" -tracing = "0.1.37" +tracing = "0.1.40" tracing-core = "0.1" tracing-forest = "0.1.6" tracing-subscriber = { version = "0.3.17", features = ["env-filter", "json"] } ulid = "1.0.0" webcrawler = { git = "https://github.com/spraakbanken/webcrawler-rs" } zip = "0.6.6" - -[workspace.metadata.cargo-machete] -ignored = ["workspace-hack"] diff --git a/Justfile b/Justfile index c5f79ab..baada3b 100644 --- a/Justfile +++ b/Justfile @@ -1,8 +1,8 @@ opendata-quick-dev: - cargo watch -q -c -w swegov-opendata -x 'run -p swegov-opendata --example quick_dev' + cargo watch -q -c -w swegov-opendata -x 'run -p swegov-opendata --example quick_dev_swegov_opendata' spiders-quick-dev: - cargo watch -q -c -w opendata-spiders -x 'run -p opendata-spiders --example quick_dev' + cargo watch -q -c -w opendata-spiders -x 'run -p opendata-spiders --example quick_dev_opendata_spiders' quick-dev: cargo watch -q -c -w src -w sfs-corpus-core -x 'run -- generate xml data/sfs/output/sfs' diff --git a/README.md b/README.md index 1f6e364..e1e205a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,26 @@ # swegov-opendata-rs -Tool used for collecting SFS (Svensk Författningssamling) from [Riksdagens öppna data](https://data.riksdagen.se). +Tools used for collecting SFS (Svensk Författningssamling) from [Riksdagens öppna data](https://data.riksdagen.se). 
+ +[![MIT licensed][mit-badge]][mit-url] + +[![Maturity badge - level 1][scorecard-badge]][scorecard-url] + +[![CI(check)][actions-check-badge]][actions-check-url] +[![CI(scheduled)][actions-scheduled-badge]][actions-scheduled-url] +[![CI(test)][actions-test-badge]][actions-test-url] + +[mit-badge]: https://img.shields.io/badge/license-MIT-blue.svg +[mit-url]: LICENSE +[actions-check-badge]: https://github.com/spraakbanken/swegov-opendata-rs/actions/workflows/check.yml/badge.svg +[actions-check-url]: https://github.com/spraakbanken/swegov-opendata-rs/actions?query=workflow%3Acheck+branch%3Amain +[actions-scheduled-badge]: https://github.com/spraakbanken/swegov-opendata-rs/actions/workflows/scheduled.yml/badge.svg +[actions-scheduled-url]: https://github.com/spraakbanken/swegov-opendata-rs/actions?query=workflow%3Ascheduled+branch%3Amain +[actions-test-badge]: https://github.com/spraakbanken/swegov-opendata-rs/actions/workflows/test.yml/badge.svg +[actions-test-url]: https://github.com/spraakbanken/swegov-opendata-rs/actions?query=workflow%3Atest+branch%3Amain +[scorecard-badge]: https://img.shields.io/badge/Maturity-Level%201%20--%20New%20Project-yellow.svg +[scorecard-url]: https://github.com/spraakbanken/getting-started/blob/main/scorecard.md -This workspace contains the binary `fetch-sfs` in the root. ## fetch-sfs @@ -60,6 +78,20 @@ The spiders work in 2 steps, -# References +## References - [Riksdagens öppna data dokumentation](https://data.riksdagen.se/dokumentation/) + +## MSRV Policy + +The MSRV (Minimum Supported Rust Version) is fixed for a given minor (1.x) +version. However it can be increased when bumping minor versions, i.e. going +from 1.0 to 1.1 allows us to increase the MSRV. Users unable to increase their +Rust version can use an older minor version instead. Below is a list of swegov-opendata-rs versions +and their MSRV: + + * v0.1: Rust 1.74. + +Note however that swegov-opendata-rs also has dependencies, which might have different MSRV +policies. 
We try to stick to the above policy when updating dependencies, but +this is not always possible. diff --git a/crates/fetch-sfs/Cargo.toml b/crates/fetch-sfs/Cargo.toml index 0624cbc..2c51294 100644 --- a/crates/fetch-sfs/Cargo.toml +++ b/crates/fetch-sfs/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "fetch-sfs" version = "0.1.0" -edition = "2021" +edition = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -15,14 +18,12 @@ doctest = false anyhow = { workspace = true } clap = { workspace = true, features = ["derive", "cargo"] } config = { workspace = true } -# error-stack = { workspace = true } serde = { workspace = true, features = ["derive"] } tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs"] } tracing = { workspace = true } tracing-subscriber = { workspace = true } webcrawler = { workspace = true } # local deps -opendata-spiders = { version = "0.1.0", path = "../opendata-spiders" } -workspace-hack = { version = "0.1", path = "../workspace-hack" } +opendata-spiders = { workspace = true } preprocessors.workspace = true diff --git a/crates/fetch-sfs/src/configuration.rs b/crates/fetch-sfs/src/configuration.rs deleted file mode 100644 index 95dc569..0000000 --- a/crates/fetch-sfs/src/configuration.rs +++ /dev/null @@ -1,18 +0,0 @@ -use opendata_spiders::sfs::SfsSpiderOptions; - -#[derive(serde::Deserialize, Clone)] -pub struct Settings { - pub sfs: SfsSpiderOptions, -} - -pub fn get_configuration() -> Result { - let base_path = - std::env::current_dir().expect("configuration: Failed to determine the current directory"); - - let settings = config::Config::builder() - .set_default("sfs.output_path", "./output")? 
- .add_source(config::File::from(base_path.join("config.json")).required(false)) - .build()?; - - settings.try_deserialize::() -} diff --git a/crates/fetch-sfs/src/main.rs b/crates/fetch-sfs/src/main.rs index 75392c7..1406647 100644 --- a/crates/fetch-sfs/src/main.rs +++ b/crates/fetch-sfs/src/main.rs @@ -1,4 +1,3 @@ -mod configuration; mod options; use std::{io, path::PathBuf, sync::Arc, time::Duration}; @@ -10,9 +9,6 @@ use webcrawler::{crawler, CrawlerOptions}; use crate::options::Args; -// pub use crate::error::Error; -const PROCESSED_STATE: &str = "visited.json"; - #[tokio::main] async fn main() -> anyhow::Result<()> { let args = Args::parse(); @@ -25,8 +21,6 @@ async fn main() -> anyhow::Result<()> { init_tracing()?; - let config = configuration::get_configuration()?; - let spider = Arc::new(opendata_spiders::sfs::SfsSpider::new( opendata_spiders::sfs::SfsSpiderOptions { user_agent: Some(APP_USER_AGENT.into()), diff --git a/crates/minidom-extension/Cargo.toml b/crates/minidom-extension/Cargo.toml index 1d2fc9d..021a313 100644 --- a/crates/minidom-extension/Cargo.toml +++ b/crates/minidom-extension/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "minidom-extension" version = "0.1.0" -edition = "2021" +edition = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] @@ -11,7 +14,6 @@ doctest = false minidom-14 = { workspace = true } pretty_assertions = { workspace = true } itertools = { workspace = true } -workspace-hack = { version = "0.1", path = "../workspace-hack" } [package.metadata.cargo-machete] ignored = ["minidom-14"] diff --git a/crates/opendata-spiders/Cargo.toml b/crates/opendata-spiders/Cargo.toml index b0b3bc2..45787a8 100644 --- a/crates/opendata-spiders/Cargo.toml +++ b/crates/opendata-spiders/Cargo.toml @@ -1,14 +1,16 @@ [package] name = "opendata-spiders" version = "0.1.0" 
-edition = "2021" +edition = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] doctest = false [dependencies] -# anyhow = "1.0.71" anyhow = { workspace = true } async-trait = { workspace = true } deserx = { workspace = true } @@ -23,9 +25,8 @@ reqwest = { workspace = true, features = [ ] } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } -swegov-opendata = { version = "0.2.0", path = "../swegov-opendata" } +swegov-opendata = { workspace = true } tokio = { workspace = true, features = ["macros", "rt-multi-thread", "fs"] } tracing = { workspace = true } ulid = { workspace = true } webcrawler = { workspace = true } -workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/crates/opendata-spiders/examples/quick_dev.rs b/crates/opendata-spiders/examples/quick_dev_opendata_spiders.rs similarity index 100% rename from crates/opendata-spiders/examples/quick_dev.rs rename to crates/opendata-spiders/examples/quick_dev_opendata_spiders.rs diff --git a/crates/preprocessors/Cargo.toml b/crates/preprocessors/Cargo.toml index 4a1d1af..498759d 100644 --- a/crates/preprocessors/Cargo.toml +++ b/crates/preprocessors/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "preprocessors" version = "0.1.0" -edition = "2021" +edition = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [[bin]] @@ -20,8 +23,7 @@ prodash = { workspace = true, features = [ "render-line", "render-line-crossterm", ] } -swegov-opendata-preprocess = { version = "0.1.0", path = "../swegov-opendata-preprocess" } +swegov-opendata-preprocess = { workspace = true} tracing = { workspace = true } tracing-forest = { workspace = true } 
tracing-subscriber = { workspace = true } -workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/crates/sfs-preprocess/Cargo.toml b/crates/sfs-preprocess/Cargo.toml index a0fb461..dd2ac2d 100644 --- a/crates/sfs-preprocess/Cargo.toml +++ b/crates/sfs-preprocess/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "sfs-preprocess" version = "0.1.0" -edition = "2021" +edition = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [[bin]] @@ -21,6 +24,5 @@ tracing-forest = { workspace = true } tracing-subscriber = { workspace = true } # local deps -preprocessors.workspace = true -swegov-opendata-preprocess = { version = "0.1.0", path = "../swegov-opendata-preprocess" } -workspace-hack = { version = "0.1", path = "../workspace-hack" } +preprocessors = {workspace = true} +swegov-opendata-preprocess = { workspace = true } diff --git a/crates/sparv-extension/Cargo.toml b/crates/sparv-extension/Cargo.toml index 69dcc5f..160d4e2 100644 --- a/crates/sparv-extension/Cargo.toml +++ b/crates/sparv-extension/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "sparv-extension" version = "0.1.0" -edition = "2021" +edition = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] @@ -12,4 +15,3 @@ error-stack = { workspace = true } serde_yaml = { workspace = true } serde = { workspace = true, features = ["derive"] } tracing = { workspace = true } -workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/crates/swegov-opendata-preprocess/Cargo.toml b/crates/swegov-opendata-preprocess/Cargo.toml index b70f3ed..4b5429d 100644 --- a/crates/swegov-opendata-preprocess/Cargo.toml +++ b/crates/swegov-opendata-preprocess/Cargo.toml 
@@ -1,7 +1,10 @@ [package] name = "swegov-opendata-preprocess" version = "0.1.0" -edition = "2021" +edition = { workspace = true } +authors = { workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] @@ -16,19 +19,18 @@ soup = { workspace = true } zip = { workspace = true } tracing = { workspace = true } tracing-subscriber = { workspace = true } -swegov-opendata = { version = "0.2.0", path = "../swegov-opendata" } +swegov-opendata = { workspace = true } chrono = { workspace = true } flate2 = { workspace = true } pretty_assertions = { workspace = true } once_cell = { workspace = true } itertools = { workspace = true } -minidom-extension = { version = "0.1.0", path = "../minidom-extension" } -sparv-extension = { version = "0.1.0", path = "../sparv-extension" } +minidom-extension = { workspace = true } +sparv-extension = { workspace = true } prodash = { workspace = true, features = [ "render-line", "render-line-crossterm", ] } -workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] itertools = { workspace = true } diff --git a/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs b/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs index 703a9b2..09cd380 100644 --- a/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs +++ b/crates/swegov-opendata-preprocess/src/preprocess_rd/html.rs @@ -208,7 +208,7 @@ fn process_node(node: &rcdom::Handle) -> ProcessNodeOutput { for child in node.children() { let ProcessNodeOutput { nodes: child_nodes, - attrs, + attrs: _, } = process_node(&child); for child_node in child_nodes { match child_node { diff --git a/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs b/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs index 24e40c8..bfc3811 100644 --- a/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs +++ 
b/crates/swegov-opendata-preprocess/src/preprocess_rd/xml.rs @@ -37,10 +37,8 @@ pub fn preprocess_xml( let mut in_dokument = false; let mut in_html = false; let mut found_html = false; - let mut collect_doc_attr = false; let mut doc_attr = None; let mut text_attr = None; - let mut collect_text_attr = false; let mut reader = Reader::from_str(xml_string); loop { match reader.read_event() { @@ -60,7 +58,6 @@ pub fn preprocess_xml( | b"dokumentstatus_url_xml" | b"dokument_url_text" | b"dokument_url_html" => { - collect_doc_attr = true; doc_attr = Some(String::from_utf8(e.name().as_ref().to_vec()).unwrap()); tracing::trace!("found doc attr '{:?}'", doc_attr); diff --git a/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs b/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs index 0a0672d..23efe46 100644 --- a/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs +++ b/crates/swegov-opendata-preprocess/src/preprocess_sfs/sfs_json/sfs_div_dok.rs @@ -1,5 +1,3 @@ -use std::borrow::Cow; - use error_stack::Report; use minidom::{ quick_xml::{events::Event, Reader}, @@ -227,7 +225,7 @@ pub fn extract_page( _ => todo!("handle End {:?} state={:?}", e, state), } } - Ok(Event::Comment(e)) => { + Ok(Event::Comment(_e)) => { continue; } Ok(e) => todo!("handle {:?} state={:?}", e, state), diff --git a/crates/swegov-opendata-preprocess/src/shared.rs b/crates/swegov-opendata-preprocess/src/shared.rs index 6f1dc74..f482dfc 100644 --- a/crates/swegov-opendata-preprocess/src/shared.rs +++ b/crates/swegov-opendata-preprocess/src/shared.rs @@ -1,5 +1,5 @@ use itertools::Itertools; -use minidom::{Element, ElementBuilder, Node}; +use minidom::{Element, Node}; use minidom_extension::{elem_is_empty, minidom}; pub fn clean_element(elem: &Element) -> Element { @@ -23,7 +23,7 @@ pub fn clean_element(elem: &Element) -> Element { new_elem } -pub fn clean_texts(elem: &mut Element) {} +pub fn clean_texts(_elem: &mut Element) {} 
fn clean_nodes(new_elem: &mut Element, elem: &Element) { for node in elem.nodes() { @@ -56,11 +56,12 @@ fn clean_nodes(new_elem: &mut Element, elem: &Element) { pub fn clean_text(text: &str) -> String { let text = text.replace('\u{AD}', ""); - text.split_whitespace() - // .split(char::is_whitespace) - .intersperse(" ") - // .filter(|part| !part.trim().is_empty()) - .collect() + Itertools::intersperse(text.split_whitespace(), " ").collect() + // text.split_whitespace() + // // .split(char::is_whitespace) + // .intersperse(" ") + // // .filter(|part| !part.trim().is_empty()) + // .collect() } #[cfg(test)] diff --git a/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs b/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs index 592a83c..7902430 100644 --- a/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs +++ b/crates/swegov-opendata-preprocess/tests/it/preprocess_sfs_json.rs @@ -35,7 +35,7 @@ fn test_preprocess_sfs_json() -> PreprocessResult<()> { // Assert let mut reader = Reader::from_reader(actual.as_slice()); - let actual = Element::from_reader(&mut reader) + let _actual = Element::from_reader(&mut reader) .change_context(PreprocessError) .attach_printable("failed to read actual")?; @@ -45,7 +45,7 @@ fn test_preprocess_sfs_json() -> PreprocessResult<()> { .attach_printable_lazy(|| example1_expected_path.to_string())?; let reader = BufReader::new(example1_expected_file); let mut reader = Reader::from_reader(reader); - let expected = Element::from_reader(&mut reader) + let _expected = Element::from_reader(&mut reader) .change_context(PreprocessError) .attach_printable("failed to read expected")?; diff --git a/crates/swegov-opendata/Cargo.toml b/crates/swegov-opendata/Cargo.toml index 2a30757..6137d1d 100644 --- a/crates/swegov-opendata/Cargo.toml +++ b/crates/swegov-opendata/Cargo.toml @@ -1,7 +1,10 @@ [package] name = "swegov-opendata" version = "0.2.0" -edition = "2021" +edition = { workspace = true } +authors = { 
workspace = true } +rust-version = { workspace = true } +license = { workspace = true } # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] @@ -16,7 +19,6 @@ serde = { workspace = true, features = ["derive"] } serde-aux = { workspace = true } serde_json = { workspace = true } serde_with = { workspace = true } -workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] quick-xml = { workspace = true, features = ["serialize"] } diff --git a/crates/swegov-opendata/examples/quick_dev.rs b/crates/swegov-opendata/examples/quick_dev_swegov_opendata.rs similarity index 100% rename from crates/swegov-opendata/examples/quick_dev.rs rename to crates/swegov-opendata/examples/quick_dev_swegov_opendata.rs diff --git a/crates/workspace-hack/.gitattributes b/crates/workspace-hack/.gitattributes deleted file mode 100644 index 3e9dba4..0000000 --- a/crates/workspace-hack/.gitattributes +++ /dev/null @@ -1,4 +0,0 @@ -# Avoid putting conflict markers in the generated Cargo.toml file, since their presence breaks -# Cargo. -# Also do not check out the file as CRLF on Windows, as that's what hakari needs. -Cargo.toml merge=binary -crlf diff --git a/crates/workspace-hack/Cargo.toml b/crates/workspace-hack/Cargo.toml deleted file mode 100644 index 33a460c..0000000 --- a/crates/workspace-hack/Cargo.toml +++ /dev/null @@ -1,38 +0,0 @@ -# This file is generated by `cargo hakari`. -# To regenerate, run: -# cargo hakari generate - -[package] -name = "workspace-hack" -version = "0.1.0" -description = "workspace-hack package, managed by hakari" -# You can choose to publish this crate: see https://docs.rs/cargo-hakari/latest/cargo_hakari/publishing. -publish = false - -# The parts of the file between the BEGIN HAKARI SECTION and END HAKARI SECTION comments -# are managed by hakari. 
- -### BEGIN HAKARI SECTION -[dependencies] -futures-core = { workspace = true } -memchr = { workspace = true } -quick-xml = { workspace = true, features = ["serialize"] } -rand_core = { workspace = true, features = ["std"] } -regex = { workspace = true } -regex-automata = { workspace = true, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax = { workspace = true } -serde = { workspace = true, features = ["alloc", "derive"] } -smallvec = { workspace = true, features = ["write"] } -tracing-core = { workspace = true } - -[build-dependencies] -memchr = { workspace = true } -proc-macro2 = { workspace = true } -quote = { workspace = true } -regex = { workspace = true } -regex-automata = { workspace = true, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax = { workspace = true } -serde = { workspace = true, features = ["alloc", "derive"] } -syn = { workspace = true, features = ["extra-traits", "full", "visit", "visit-mut"] } - -### END HAKARI SECTION diff --git a/crates/workspace-hack/build.rs b/crates/workspace-hack/build.rs deleted file mode 100644 index 92518ef..0000000 --- a/crates/workspace-hack/build.rs +++ /dev/null @@ -1,2 +0,0 @@ -// A build script is required for cargo to consider build dependencies. -fn main() {} diff --git a/crates/workspace-hack/src/lib.rs b/crates/workspace-hack/src/lib.rs deleted file mode 100644 index 22489f6..0000000 --- a/crates/workspace-hack/src/lib.rs +++ /dev/null @@ -1 +0,0 @@ -// This is a stub lib.rs.