feat: ruff as formatter (#322)
* ruff formatter instead of black

* refactor: ruff reformatted files

* feat: more complete ruff adjustments

* refactor: all codebase to comply with ruff rules

* chore: update lock

* feat: more stringent docstring rules

* revert: remove isort and black from Makefile
d0choa authored Dec 12, 2023
1 parent 0bd02aa commit 6798153
Showing 51 changed files with 192 additions and 213 deletions.
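In practice, this consolidates three tools into one: ruff now handles linting, import sorting (the `I` rules), and formatting via `ruff format`, replacing black and isort. A minimal sketch of the style the new setup enforces, on a hypothetical module that is not part of this commit (the leading future import is assumed to be carried over from the old isort `--add-import` behaviour, e.g. via ruff's isort `required-imports` option; the Google-style docstring reflects the stricter docstring rules mentioned above):

from __future__ import annotations

import re
from pathlib import Path


def gcst_accessions(paths: list[Path]) -> list[str]:
    """Extract GCST accessions from harmonised summary-statistics paths.

    Args:
        paths (list[Path]): Candidate file paths.

    Returns:
        list[str]: Unique accessions, in first-seen order.
    """
    pattern = re.compile(r"GCST\d+")
    seen: dict[str, None] = {}
    for path in paths:
        if match := pattern.search(str(path)):
            seen.setdefault(match.group(), None)
    return list(seen)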
15 changes: 3 additions & 12 deletions .pre-commit-config.yaml
@@ -31,19 +31,10 @@ repos:
rev: v0.1.6
hooks:
- id: ruff

- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args:
[--add-import, from __future__ import annotations, --profile, black]

- repo: https://github.com/psf/black
rev: 23.11.0
hooks:
- id: black

- --fix
- id: ruff-format
files: ^((otg|utils|tests)/.+)?[^/]+\.py$
- repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
rev: v9.10.0
hooks:
2 changes: 0 additions & 2 deletions .vscode/extensions.json
@@ -1,10 +1,8 @@
{
"recommendations": [
"charliermarsh.ruff",
"ms-python.isort",
"ms-python.mypy-type-checker",
"ms-python.python",
"ms-python.black-formatter",
"esbenp.prettier-vscode"
],
"unwantedRecommendations": ["ms-python.flake8"]
7 changes: 2 additions & 5 deletions .vscode/settings.json
@@ -1,6 +1,6 @@
{
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.formatOnPaste": false,
"editor.formatOnSave": true,
"editor.codeActionsOnSave": {
@@ -19,14 +19,11 @@
}
},
"json.format.keepLines": true,
"isort.args": ["--profile", "black"],
"autoDocstring.docstringFormat": "google",
"python.testing.pytestArgs": [".", "--doctest-modules", "--cov=src/"],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"mypy-type-checker.severity": {
"error": "Information"
},
"ruff.fixAll": false,
"ruff.organizeImports": false
}
}
3 changes: 0 additions & 3 deletions Makefile
@@ -26,9 +26,6 @@ check: ## Lint and format code
@echo "Linting docstrings..."
@poetry run pydoclint --config=pyproject.toml src
@poetry run pydoclint --config=pyproject.toml --skip-checking-short-docstrings=true tests
@echo "Formatting..."
@poetry run black src/otg .
@poetry run isort src/otg .

test: ## Run tests
@echo "Running Tests..."
1 change: 0 additions & 1 deletion README.md
@@ -2,7 +2,6 @@
[![docs](https://github.com/opentargets/genetics_etl_python/actions/workflows/docs.yaml/badge.svg)](https://opentargets.github.io/genetics_etl_python/)
[![codecov](https://codecov.io/gh/opentargets/genetics_etl_python/branch/main/graph/badge.svg?token=5ixzgu8KFP)](https://codecov.io/gh/opentargets/genetics_etl_python)
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/genetics_etl_python/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/genetics_etl_python)

# Genetics Portal Data Pipeline (experimental)
1 change: 0 additions & 1 deletion docs/index.md
@@ -21,7 +21,6 @@ hide:
![docs](https://github.com/opentargets/genetics_etl_python/actions/workflows/docs.yaml/badge.svg)
[![codecov](https://codecov.io/gh/opentargets/genetics_etl_python/branch/main/graph/badge.svg?token=5ixzgu8KFP)](https://codecov.io/gh/opentargets/genetics_etl_python)
[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/opentargets/genetics_etl_python/main.svg)](https://results.pre-commit.ci/badge/github/opentargets/genetics_etl_python)

---
50 changes: 5 additions & 45 deletions poetry.lock

Some generated files are not rendered by default.

98 changes: 88 additions & 10 deletions pyproject.toml
@@ -28,7 +28,6 @@ scikit-learn = "^1.3.2"

[tool.poetry.dev-dependencies]
pre-commit = "^3.6.0"
black = {version = "^22.12.0", allow-prereleases = true}
mypy = "^1.7"
pep8-naming = "^0.13.2"
interrogate = "^1.5.0"
@@ -84,9 +83,6 @@ upload_to_release = false
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.isort]
profile = "black"

[tool.deptry]
extend_exclude = ["src/conftest.py", "src/airflow", "src/utils"]

@@ -135,12 +131,94 @@ ignore_missing_imports = true


[tool.ruff]
select = ["D", "E", "F", "B"]
ignore = ["D417", "E501", "E203"]
fixable = ["F401"]
unfixable = ["B"]
line-length = 88
target-version = "py310"
select = [
"B002", # Python does not support the unary prefix increment
"B007", # Loop control variable {name} not used within loop body
"B014", # Exception handler with duplicate exception
"B023", # Function definition does not bind loop variable {name}
"B026", # Star-arg unpacking after a keyword argument is strongly discouraged
"C", # complexity
"COM818", # Trailing comma on bare tuple prohibited
"D", # docstrings
"DTZ003", # Use datetime.now(tz=) instead of datetime.utcnow()
"DTZ004", # Use datetime.fromtimestamp(ts, tz=) instead of datetime.utcfromtimestamp(ts)
"E", # pycodestyle
"F", # pyflakes/autoflake
"G", # flake8-logging-format
"I", # isort
"ICN001", # import concentions; {name} should be imported as {asname}
"ISC001", # Implicitly concatenated string literals on one line
"N804", # First argument of a class method should be named cls
"N805", # First argument of a method should be named self
"N815", # Variable {name} in class scope should not be mixedCase
"PGH001", # No builtin eval() allowed
"PGH004", # Use specific rule codes when using noqa
"PLC0414", # Useless import alias. Import alias does not rename original package.
"PLC", # pylint
"PLE", # pylint
"PLR", # pylint
"PLW", # pylint
"Q000", # Double quotes found but single quotes preferred
"RUF006", # Store a reference to the return value of asyncio.create_task
"S102", # Use of exec detected
"S103", # bad-file-permissions
"S108", # hardcoded-temp-file
"S306", # suspicious-mktemp-usage
"S307", # suspicious-eval-usage
"S313", # suspicious-xmlc-element-tree-usage
"S314", # suspicious-xml-element-tree-usage
"S315", # suspicious-xml-expat-reader-usage
"S316", # suspicious-xml-expat-builder-usage
"S317", # suspicious-xml-sax-usage
"S318", # suspicious-xml-mini-dom-usage
"S319", # suspicious-xml-pull-dom-usage
"S320", # suspicious-xmle-tree-usage
"S601", # paramiko-call
"S602", # subprocess-popen-with-shell-equals-true
"S604", # call-with-shell-equals-true
"S608", # hardcoded-sql-expression
"S609", # unix-command-wildcard-injection
"SIM105", # Use contextlib.suppress({exception}) instead of try-except-pass
"SIM117", # Merge with-statements that use the same scope
"SIM118", # Use {key} in {dict} instead of {key} in {dict}.keys()
"SIM201", # Use {left} != {right} instead of not {left} == {right}
"SIM208", # Use {expr} instead of not (not {expr})
"SIM212", # Use {a} if {a} else {b} instead of {b} if not {a} else {a}
"SIM300", # Yoda conditions. Use 'age == 42' instead of '42 == age'.
"SIM401", # Use get from dict with default instead of an if block
"T100", # Trace found: {name} used
"T20", # flake8-print
"TID251", # Banned imports
"TRY004", # Prefer TypeError exception for invalid type
"TRY200", # Use raise from to specify exception cause
"TRY302", # Remove exception handler; error is immediately re-raised
"UP", # pyupgrade
"W", # pycodestyle
]

ignore = [
"E501", # line too long
"E731", # do not assign a lambda expression, use a def

# Ignore ignored, as the rule is now back in preview/nursery, which cannot
# be ignored anymore without warnings.
# https://github.com/astral-sh/ruff/issues/7491
# "PLC1901", # Lots of false positives

# False positives https://github.com/astral-sh/ruff/issues/5386
"PLC0208", # Use a sequence type instead of a `set` when iterating over values
"PLR0911", # Too many return statements ({returns} > {max_returns})
"PLR0912", # Too many branches ({branches} > {max_branches})
"PLR0913", # Too many arguments to function call ({c_args} > {max_args})
"PLR0915", # Too many statements ({statements} > {max_statements})
"PLR2004", # Magic value used in comparison, consider replacing {value} with a constant variable
"PLW2901", # Outer {outer_kind} variable {name} overwritten by inner {inner_kind} target
"UP006", # keep type annotation style as is
"UP007", # keep type annotation style as is
# Ignored due to performance: https://github.com/charliermarsh/ruff/issues/2923
"UP038", # Use `X | Y` in `isinstance` call instead of `(X, Y)`

]

[tool.ruff.per-file-ignores]
"__init__.py" = ["E402"]
28 changes: 13 additions & 15 deletions src/airflow/dags/common_airflow.py
@@ -48,16 +48,16 @@


# Shared DAG construction parameters.
shared_dag_args = dict(
owner="Open Targets Data Team",
retries=0,
)
shared_dag_kwargs = dict(
tags=["genetics_etl", "experimental"],
start_date=pendulum.now(tz="Europe/London").subtract(days=1),
schedule="@once",
catchup=False,
)
shared_dag_args = {
"owner": "Open Targets Data Team",
"retries": 0,
}
shared_dag_kwargs = {
"tags": ["genetics_etl", "experimental"],
"start_date": pendulum.now(tz="Europe/London").subtract(days=1),
"schedule": "@once",
"catchup": False,
}
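The dict(...)-to-literal rewrites above match the flake8-comprehensions family enabled by the `C` selector (presumably C408, unnecessary `dict` call), which prefers literals over constructor calls:

# Flagged (C408): a dict() call with keyword arguments.
args = dict(owner="Open Targets Data Team", retries=0)
# Preferred: the equivalent literal.
args = {"owner": "Open Targets Data Team", "retries": 0}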


def create_cluster(
@@ -110,7 +110,7 @@ def create_cluster(
if num_local_ssds:
for worker_section in ("worker_config", "secondary_worker_config"):
# Create a disk config section if it does not exist.
cluster_config[worker_section].setdefault("disk_config", dict())
cluster_config[worker_section].setdefault("disk_config", {})
# Specify the number of local SSDs.
cluster_config[worker_section]["disk_config"][
"num_local_ssds"
@@ -288,7 +288,7 @@ def read_yaml_config(config_path: Path) -> Any:
Any: Parsed YAML config file.
"""
assert config_path.exists(), f"YAML config path {config_path} does not exist."
with open(config_path, "r") as config_file:
with open(config_path) as config_file:
return yaml.safe_load(config_file)


@@ -349,8 +349,6 @@ def submit_pyspark_job_no_operator(
},
},
}
res = job_client.submit_job(
job_client.submit_job(
project_id=GCP_PROJECT, region=GCP_REGION, job=job_description
)
job_id = res.reference.job_id
print(f"Submitted job ID {job_id}.")
14 changes: 7 additions & 7 deletions src/airflow/dags/gwas_catalog_harmonisation.py
@@ -52,20 +52,20 @@ def create_to_do_list(**kwargs: Any) -> Any:
raw_harmonised = ti.xcom_pull(
task_ids="list_raw_harmonised", key="return_value"
)
print("Number of raw harmonised files: ", len(raw_harmonised))
print("Number of raw harmonised files: ", len(raw_harmonised)) # noqa: T201
to_do_list = []
# Remove the ones that have been processed
parquets = ti.xcom_pull(task_ids="list_harmonised_parquet", key="return_value")
print("Number of parquet files: ", len(parquets))
print("Number of parquet files: ", len(parquets)) # noqa: T201
for path in raw_harmonised:
match_result = re.search(
"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", path
r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", path
)
if match_result:
study_id = match_result.group(2)
if f"harmonised/{study_id}.parquet/_SUCCESS" not in parquets:
to_do_list.append(path)
print("Number of jobs to submit: ", len(to_do_list))
print("Number of jobs to submit: ", len(to_do_list)) # noqa: T201
ti.xcom_push(key="to_do_list", value=to_do_list)

# Submit jobs to dataproc
@@ -78,18 +78,18 @@ def submit_jobs(**kwargs: Any) -> None:
"""
ti = kwargs["ti"]
todo = ti.xcom_pull(task_ids="create_to_do_list", key="to_do_list")
print("Number of jobs to submit: ", len(todo))
print("Number of jobs to submit: ", len(todo)) # noqa: T201
for i in range(len(todo)):
# Not to exceed default quota 400 jobs per minute
if i > 0 and i % 399 == 0:
time.sleep(60)
input_path = todo[i]
match_result = re.search(
"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", input_path
r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", input_path
)
if match_result:
study_id = match_result.group(2)
print("Submitting job for study: ", study_id)
print("Submitting job for study: ", study_id) # noqa: T201
common.submit_pyspark_job_no_operator(
cluster_name=CLUSTER_NAME,
step_id="gwas_catalog_sumstat_preprocess",
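The added `r` prefixes fix invalid escape sequences (pycodestyle W605, enabled via the `W` family): in a regular string literal, `\d` is not a valid Python escape and has long been deprecated in CPython. A minimal check with a hypothetical path:

import re

path = "raw-harmonised/efo/GCST90123456/harmonised/x.h.tsv.gz"
match = re.search(
    r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", path
)
assert match is not None and match.group(2) == "GCST90123456"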
2 changes: 1 addition & 1 deletion src/otg/cli.py
@@ -13,7 +13,7 @@ def main(cfg: DictConfig) -> None:
Args:
cfg (DictConfig): hydra configuration object
"""
print(OmegaConf.to_yaml(cfg))
print(OmegaConf.to_yaml(cfg)) # noqa: T201
# Initialise and run step
instantiate(cfg.step)
