Change API for getting article dataframes and add pretoken analysis (#12)
cthoyt authored Feb 5, 2024
1 parent 3b472cd commit e4d324e
Showing 5 changed files with 178 additions and 45 deletions.
7 changes: 4 additions & 3 deletions src/biolexica/literature/__init__.py
@@ -6,12 +6,13 @@
annotate_abstracts_from_pubmeds,
annotate_abstracts_from_search,
)
from .retrieve import get_pubmed_dataframe
from .search import query_pubmed
from .retrieve import get_article_dataframe_from_pubmeds
from .search import get_article_dataframe_from_search, query_pubmed

__all__ = [
"query_pubmed",
"get_pubmed_dataframe",
"get_article_dataframe_from_pubmeds",
"get_article_dataframe_from_search",
"AnnotatedArticle",
"Annotation",
"annotate_abstracts_from_pubmeds",
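As a quick orientation to the updated public exports, here is a minimal usage sketch (not part of the commit) that assumes this version of the package is installed and that a network connection is available for the PubMed query:

from biolexica.literature import (
    get_article_dataframe_from_pubmeds,
    get_article_dataframe_from_search,
    query_pubmed,
)

# get_pubmed_dataframe was replaced by get_article_dataframe_from_pubmeds;
# get_article_dataframe_from_search chains query_pubmed with retrieval.
pubmed_ids = query_pubmed("dementia")
df = get_article_dataframe_from_pubmeds(pubmed_ids[:50], show_progress=False)
print(df.columns)  # title and abstract, indexed by PubMed identifier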
63 changes: 63 additions & 0 deletions src/biolexica/literature/analyze.py
@@ -8,10 +8,12 @@
from curies import Reference

from .annotate import AnnotatedArticle
from ..api import GrounderHint, load_grounder

__all__ = [
"count_references",
"count_cooccurrences",
"analyze_pretokens",
]


@@ -35,3 +37,64 @@ def count_cooccurrences(
for annotated_article in annotated_articles
for pair in combinations(annotated_article.count_references(), 2)
)


def analyze_pretokens(
text: str, *, grounder: GrounderHint, min_length: int = 1, max_length: int = 4
) -> t.Counter[str]:
"""Take a histogram over tokens appearing before matches to identify more detailed terms for curation.
:param text: The text to analyze
:param grounder: The grounder
:param min_length: The minimum number of pre-tokens to keep a histogram
:param max_length: The maximum number of pre-tokens to keep a histogram
:returns: A counter of pre-tokens in the given length range
Here's an example where we look at recent literature about dementia and try and
identify if there are:
1. synonyms that could be curated in one of the upstream first-party lexical resources
or third-party lexical resources like Biosynonyms
2. terms that can be added to upstream ontologies, databases, etc.
.. code-block:: python
from collections import Counter
from tabulate import tabulate
import biolexica
from biolexica.literature import get_article_dataframe_from_search
from biolexica.literature.analyze import analyze_pretokens
grounder = biolexica.load_grounder("phenotype")
df = get_article_dataframe_from_search("dementia")
counter = Counter()
for abstract in df["abstract"]:
counter.update(analyze_pretokens(abstract, grounder=grounder))
table = tabulate(counter.most_common(), headers=["phrase", "count"], tablefmt="github")
print(table)
"""
from gilda.ner import stop_words

grounder = load_grounder(grounder)
text = text.replace("\n", " ").replace("  ", " ")
rv: t.Counter[str] = Counter()
for annotation in grounder.annotate(text):
parts = text[: annotation.start].split()
for i in range(min_length, max_length + 1):
reduced_parts = parts[-i:]
if len(reduced_parts) < min_length:
continue
if reduced_parts[0].lower() in stop_words:
# doesn't make sense for a named entity to start
# with one of these words, like "of"
continue
if reduced_parts[0].isnumeric():
continue
if any(part.strip().endswith(".") for part in reduced_parts):
# If any of the parts ends with a dot, it means that this
# set of pre-words goes into the previous sentence, so skip
continue
pre = " ".join(reduced_parts)
rv[pre] += 1
return rv
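For a self-contained feel of the new function, here is a small sketch (not part of the commit) that runs analyze_pretokens on a single hard-coded sentence instead of retrieved abstracts; the "phenotype" grounder hint is the same one used in the docstring example above:

import biolexica
from biolexica.literature.analyze import analyze_pretokens

grounder = biolexica.load_grounder("phenotype")
text = "Patients with early-onset dementia often also show progressive memory loss."
# Histogram of the 1- and 2-word phrases directly preceding each grounded match,
# e.g. a phrase like "early-onset" if the grounder matches "dementia".
counter = analyze_pretokens(text, grounder=grounder, min_length=1, max_length=2)
print(counter.most_common())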
55 changes: 19 additions & 36 deletions src/biolexica/literature/annotate.py
@@ -3,18 +3,16 @@
from __future__ import annotations

import logging
import time
import typing as t
from collections import Counter
from typing import List, Optional, Union

from curies import Reference
from more_itertools import batched
from pydantic import BaseModel
from tqdm.auto import tqdm

from biolexica.api import Annotation, GrounderHint, load_grounder
from biolexica.literature.retrieve import get_pubmed_dataframe
from biolexica.literature.retrieve import _iter_dataframes_from_pubmeds
from biolexica.literature.search import query_pubmed

__all__ = [
@@ -63,48 +61,33 @@ def annotate_abstracts_from_pubmeds(
grounder: GrounderHint,
*,
use_indra_db: bool = True,
batch_size: int = 20_000,
batch_size: Optional[int] = None,
show_progress: bool = True,
) -> List[AnnotatedArticle]:
"""Annotate the given articles using the given Gilda grounder."""
n_pmids = len(pubmed_ids)

rv: List[AnnotatedArticle] = []

grounder = load_grounder(grounder)

outer_it = tqdm(
batched(pubmed_ids, batch_size),
total=1 + n_pmids // batch_size,
unit="batch",
desc="Annotating articles",
disable=not show_progress,
df_iterator = _iter_dataframes_from_pubmeds(
pubmed_ids=pubmed_ids,
batch_size=batch_size,
use_indra_db=use_indra_db,
show_progress=show_progress,
)
for i, pubmed_batch in enumerate(outer_it, start=1):
t = time.time()
pubmed_batch = list(pubmed_batch)
articles_df = get_pubmed_dataframe(pubmed_batch, use_indra_db=use_indra_db).reset_index()
n_retrieved = len(articles_df.index)
tqdm.write(
f"[batch {i}] Got {n_retrieved:,} articles "
f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
rv: List[AnnotatedArticle] = [
AnnotatedArticle(
pubmed=pubmed,
title=title,
abstract=abstract,
annotations=grounder.annotate(abstract),
)
for pmid, title, abstract in tqdm(
articles_df.values,
for i, df in enumerate(df_iterator, start=1)
for pubmed, title, abstract in tqdm(
df.itertuples(),
desc=f"Annotating batch {i}",
unit_scale=True,
unit="article",
total=n_retrieved,
total=len(df.index),
leave=False,
disable=not show_progress,
):
rv.append(
AnnotatedArticle(
pubmed=pmid,
title=title,
abstract=abstract,
annotations=grounder.annotate(abstract),
)
)

)
]
return rv
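Since annotate_abstracts_from_pubmeds now delegates batching to the retrieval module, batch_size is optional and falls back to the retriever's default when left as None. A minimal usage sketch (not part of the commit; the PubMed identifiers are placeholders):

import biolexica
from biolexica.literature import annotate_abstracts_from_pubmeds

grounder = biolexica.load_grounder("phenotype")
articles = annotate_abstracts_from_pubmeds(
    ["12345678", "23456789"],  # placeholder PubMed identifiers
    grounder,
    use_indra_db=False,  # skip the INDRA database and use the PubMed web API
    show_progress=False,
)
for article in articles:
    print(article.pubmed, article.count_references().most_common(3))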
65 changes: 60 additions & 5 deletions src/biolexica/literature/retrieve.py
@@ -3,14 +3,16 @@
from __future__ import annotations

import logging
from typing import Dict, Iterable, List, Union
import time
from typing import Dict, Iterable, List, Optional, Union

import pandas as pd
from more_itertools import batched
from tqdm.asyncio import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

__all__ = [
"get_pubmed_dataframe",
"get_article_dataframe_from_pubmeds",
"PUBMED_DATAFRAME_COLUMNS",
"clean_df",
]
@@ -21,10 +23,27 @@
PUBMED_DATAFRAME_COLUMNS = ["pubmed", "title", "abstract"]


def get_pubmed_dataframe(
pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None
def get_article_dataframe_from_pubmeds(
pubmed_ids: Iterable[Union[str, int]],
*,
use_indra_db: bool = True,
db=None,
batch_size: Optional[int] = None,
show_progress: bool = True,
) -> pd.DataFrame:
"""Get a dataframe indexed by PubMed identifier (str) with title and abstract columns."""
return pd.concat(
_iter_dataframes_from_pubmeds(
pubmed_ids=pubmed_ids,
use_indra_db=use_indra_db,
db=db,
batch_size=batch_size,
show_progress=show_progress,
)
)


def _get_batch(pubmed_ids: Iterable[Union[str, int]], *, use_indra_db: bool = True, db=None):
if use_indra_db:
try:
return _from_indra_db(pubmed_ids, db=db)
@@ -36,6 +55,41 @@ def get_pubmed_dataframe(
return _from_api(pubmed_ids)


def _iter_dataframes_from_pubmeds(
pubmed_ids: Iterable[Union[str, int]],
*,
use_indra_db: bool = True,
db=None,
batch_size: Optional[int] = None,
show_progress: bool = True,
) -> Iterable[pd.DataFrame]:
"""Query PubMed for article identifiers based on a given search and get a dataframe."""
if batch_size is None:
batch_size = 20_000

pubmed_ids = _clean_pubmeds(pubmed_ids)
if len(pubmed_ids) < batch_size:
# only a single batch, so the progress bar is not needed
show_progress = False
outer_it = tqdm(
batched(pubmed_ids, batch_size),
total=1 + len(pubmed_ids) // batch_size,
unit="batch",
desc="Getting articles",
disable=not show_progress,
)
for i, pubmed_batch in enumerate(outer_it, start=1):
pubmed_batch = list(pubmed_batch)
t = time.time()
df = _get_batch(pubmed_batch, use_indra_db=use_indra_db, db=db)
n_retrieved = len(df.index)
outer_it.write(
f"[batch {i}] Got {n_retrieved:,} articles "
f"({n_retrieved/len(pubmed_batch):.1%}) in {time.time() - t:.2f} seconds"
)
yield df


def _clean_pubmeds(pubmeds: Iterable[Union[str, int]]) -> List[str]:
return sorted(map(str, pubmeds), key=int)

@@ -58,7 +112,8 @@ def _from_api(pmids: Iterable[Union[str, int]]) -> pd.DataFrame:
desc="Getting PubMed titles/abstracts",
)
]
df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS).set_index("pubmed")
df = pd.DataFrame(rows, columns=PUBMED_DATAFRAME_COLUMNS)
df = df.set_index("pubmed")
df = clean_df(df)
return df

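The public get_article_dataframe_from_pubmeds simply concatenates the per-batch dataframes produced by the private iterator. A small sketch of calling it directly (not part of the commit; the identifiers are placeholders):

from biolexica.literature import get_article_dataframe_from_pubmeds

# Returns one dataframe indexed by PubMed identifier (str) with "title" and
# "abstract" columns, concatenated over however many batches were needed.
df = get_article_dataframe_from_pubmeds(
    ["12345678", "23456789"],
    use_indra_db=False,  # go straight to the PubMed web API
    batch_size=10_000,  # optional; None falls back to the 20,000 default
    show_progress=False,
)
print(df.head())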
33 changes: 32 additions & 1 deletion src/biolexica/literature/search.py
@@ -5,15 +5,46 @@
import subprocess
from typing import Any, List, Literal, Optional

import pandas as pd

from .retrieve import get_article_dataframe_from_pubmeds

__all__ = [
"get_article_dataframe_from_search",
"query_pubmed",
]

Method = Literal["api", "esearch"]


def get_article_dataframe_from_search(
search_term: str,
*,
method: Optional[Method] = None,
use_indra_db: bool = True,
db=None,
batch_size: Optional[int] = None,
show_progress: bool = True,
limit: Optional[int] = None,
**kwargs: Any,
) -> pd.DataFrame:
"""Query PubMed for article identifiers based on a given search and get a dataframe."""
pubmed_ids = query_pubmed(search_term, method=method, **kwargs)
if limit:
pubmed_ids = pubmed_ids[:limit]
return get_article_dataframe_from_pubmeds(
pubmed_ids,
use_indra_db=use_indra_db,
db=db,
batch_size=batch_size,
show_progress=show_progress,
)


def query_pubmed(
search_term: str,
*,
method: Optional[Literal["api", "esearch"]] = None,
method: Optional[Method] = None,
**kwargs: Any,
) -> List[str]:
"""Query PubMed for article identifiers based on a given search."""
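Putting the pieces together, get_article_dataframe_from_search chains query_pubmed with the batched retrieval shown above. A minimal sketch (not part of the commit):

from biolexica.literature import get_article_dataframe_from_search

# "limit" truncates the PubMed hit list before retrieval, which keeps
# exploratory searches quick; method may be "api" or "esearch".
df = get_article_dataframe_from_search(
    "dementia",
    method="api",
    limit=100,
    show_progress=False,
)
print(len(df.index), "articles retrieved")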
