Skip to content

Commit

Permalink
Add documentation updates (#21)
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt authored Jan 18, 2024
1 parent 133407a commit 18c0c18
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 26 deletions.
26 changes: 12 additions & 14 deletions src/eliater/discover_latent_nodes.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
r"""This module contains methods to discover nuisance nodes in a network.
Given an acyclic directed mixed graph (ADMG), along with the treatment and the outcome
of interest, certain observable variables can be regarded as nuisances. This
classification arises because they do not have any impact on the outcome and should not
be involved in the estimation of the treatment's effect on the outcome. These specific
variables are descendants of the variables on all causal paths that are not ancestors of
the outcome. A causal path, in this context, refers to a directed path that starts from the
treatment and leads to the outcome such that all the arrows on the path have the same direction.
This module is designed to identify these variables.
This process enables us to concentrate on the fundamental variables needed to estimate the
treatment's impact on the outcome. This focus results in more precise estimates with reduced
of interest, certain observable variables do not have any impact on the outcome and should not
be involved in the estimation of the treatment's effect on the outcome. We call such variables
nuisance variables. These specific variables are descendants of the variables on all causal paths
that are not ancestors of the outcome. A causal path, in this context, refers to a directed path
that starts from the treatment and leads to the outcome such that all the arrows on the path have
the same direction. This module is designed to identify these variables.
This process enables us to concentrate on the fundamental and necessary variables needed to estimate the
treatment's impact on the outcome. This focus results in more precise causal query estimates with reduced
variance and bias. In addition, if this process is combined with the simplification function
:func:`y0.algorithm.simplify_latent.simplify_latent_dag` it can help to remove the nuisance variables
from the graph which leads to simpler, more interpretable, and visually more appealing result.
:func:`y0.algorithm.simplify_latent.simplify_latent_dag` it can help to create a new graph that does not
contain nuisance variables. This simplification leads to simpler, more interpretable, and visually more
appealing result.
Example
-------
Expand Down Expand Up @@ -141,8 +141,6 @@
$Z_4$ is removed as its children are a subset of $Z_1$'s children.
"""

# FIXME @sara the explanation section is not easy to follow. please revise it.

import itertools
from typing import Iterable, Optional, Set, Union

Expand Down
26 changes: 18 additions & 8 deletions src/eliater/examples/ecoli.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
"""Examples for transcriptional Escherichia coli K-12 regulatory network."""
"""Examples for transcriptional Escherichia coli K-12 regulatory network.
# FIXME add the following documentation. DO NOT remove this fixme without review and confirmation.
# 1. Where did this network come from? What physical experimentation was used to create it?
# What database was it in and how was the database accessed (via code? by hand?)
# 2. What is the biological phenomena described here? More detail needed.
# 3. Is there associated data to go with this graph? Commit in the examples repository
The E. Coli regulatory network was extracted manually (by hand) from the EcoCyc
database [Keseler2021]_ . The nodes represent genes, and the edges represent
regulatory relationships.
.. [Keseler2021] `The EcoCyc database in 2021 <https://doi.org/10.3389/fmicb.2021.711077>`_
"""

# FIXME what kind of regulatory relationships?
# FIXME What kinds of experiments did they come from?
# FIXME What was the method that these experiments got into the database?
# FIXME How was the network extracted manually? What was the thought process?
# FIXME why do the explanations in the module docstring and the example not match?

from y0.algorithm.identify import Query
from y0.examples import Example
Expand Down Expand Up @@ -118,8 +125,11 @@
" ... & Vitek, O. (2023). Optimal adjustment sets for causal query estimation in partially"
" observed biomolecular networks. Bioinformatics, 39(Supplement_1), i494-i503.",
graph=graph,
description="This is the transcriptional E. Coli regulatory network"
" obtained from EcoCyc database ",
description="This is the transcriptional E. Coli regulatory network obtained from EcoCyc database. "
"The experimental data were 260 RNA-seq normalized expression profiles of E. coli K-12"
" MG1655 and BW25113 across 154 unique experimental conditions, extracted from the PRECISE"
" database by (Sastry et al., 2019) from this paper: 'The Escherichia coli transcriptome mostly"
" consists of independently regulated modules' ",
example_queries=[Query.from_str(treatments="fur", outcomes="dpiA")],
)

Expand Down
177 changes: 174 additions & 3 deletions src/eliater/examples/sars.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
"""Examples for SARS-CoV-2."""

__all__ = [
"sars_cov_2_example",
]
import numpy as np
import pandas as pd

from y0.algorithm.identify import Query
from y0.dsl import Variable
from y0.examples import Example
from y0.graph import NxMixedGraph

__all__ = [
"sars_cov_2_example",
]

graph = NxMixedGraph.from_str_edges(
directed=[
("SARS_COV2", "ACE2"),
Expand Down Expand Up @@ -39,6 +43,172 @@
],
)


def _r_exp(x):
return 1 / (1 + np.exp(x))


def generate_discrete(
num_samples: int, treatments: dict[Variable, float] | None = None, *, seed: int | None = None
) -> pd.DataFrame:
"""Generate testing data for the SARS-CoV-2 large graph.
:param num_samples: The number of samples to generate. Try 1000.
:param treatments: An optional dictionary of the values to fix each variable to.
:param seed: An optional random seed for reproducibility purposes
:returns: A pandas Dataframe with columns corresponding
to the variable names SARS-CoV-2 large graph
"""
if treatments is None:
treatments = {}
generator = np.random.default_rng(seed)

gefi = generator.binomial(n=1, p=0.9, size=num_samples)
toci = generator.binomial(n=1, p=0.7, size=num_samples)
u_sars_ang = generator.binomial(n=1, p=0.6, size=num_samples)
u_prr_nfkb = generator.binomial(n=1, p=0.6, size=num_samples)
u_egf_egfr = generator.binomial(n=1, p=0.6, size=num_samples)
u_tnf_egfr = generator.binomial(n=1, p=0.6, size=num_samples)
u_il6_stat_egfr = generator.binomial(n=1, p=0.6, size=num_samples)
u_adam17_sil6_ra = generator.binomial(n=1, p=0.6, size=num_samples)

beta0_u_sars_ang_to_sars_cov2 = -1.8
beta_u_sars_ang = 0.05 # positive
loc_sars_cov2 = np.array(_r_exp(-beta0_u_sars_ang_to_sars_cov2 - u_sars_ang * beta_u_sars_ang))
sars_cov2 = generator.binomial(n=1, p=loc_sars_cov2, size=num_samples)

beta0_sars_cov2_to_ace2 = 1.5
beta_sars_cov2_to_ace2 = -0.04 # negative
loc_ace2 = _r_exp(-beta0_sars_cov2_to_ace2 - sars_cov2 * beta_sars_cov2_to_ace2)
ace2 = generator.binomial(n=1, p=loc_ace2, size=num_samples)

beta0_ang = 1.1
beta_ace2_to_ang = -0.06 # negative
beta_u_sars_ang = 0.05 # positive
loc_ang = _r_exp(-beta0_ang - ace2 * beta_ace2_to_ang - u_sars_ang * beta_u_sars_ang)
ang = generator.binomial(n=1, p=loc_ang, size=num_samples)

beta0_agtr1 = -1.5
beta_ang_to_agtr1 = 0.08
loc_agtr1 = _r_exp(-beta0_agtr1 - ang * beta_ang_to_agtr1)
agtr1 = generator.binomial(n=1, p=loc_agtr1, size=num_samples)

beta0_adam17 = -1
beta_agtr1_to_adam17 = 0.04
beta_u_adam17_sil6r_to_adam17 = 0.04
loc_adam17 = _r_exp(
-beta0_adam17
- agtr1 * beta_agtr1_to_adam17
- u_adam17_sil6_ra * beta_u_adam17_sil6r_to_adam17
)
adam17 = generator.binomial(n=1, p=loc_adam17, size=num_samples)

beta0_sil6r = -1.9
beta_adam17_to_sil6r = 0.03
beta_u_to_sil6r = 0.05
beta_toci_to_sil6r = -0.04 # negative
loc_sil6r = _r_exp(
-beta0_sil6r
- adam17 * beta_adam17_to_sil6r
- u_adam17_sil6_ra * beta_u_to_sil6r
- toci * beta_toci_to_sil6r
)
sil6r = generator.binomial(n=1, p=loc_sil6r, size=num_samples)

beta0_egf = -1.6
beta_adam17_to_egf = 0.03
beta_u_to_egf = 0.05
loc_egf = _r_exp(-beta0_egf - adam17 * beta_adam17_to_egf - u_egf_egfr * beta_u_to_egf)
egf = generator.binomial(n=1, p=loc_egf, size=num_samples)

beta0_tnf = -1.8
beta_adam17_to_tnf = 0.05
beta_u_to_tnf = 0.06
loc_tnf = _r_exp(-beta0_tnf - adam17 * beta_adam17_to_tnf - u_tnf_egfr * beta_u_to_tnf)
tnf = generator.binomial(n=1, p=loc_tnf, size=num_samples)

beta0_egfr = -1.9
beta_egf_to_egfr = 0.03
beta_u1_to_egfr = 0.05
beta_u2_to_egfr = 0.02
beta_u3_to_egfr = 0.04
beta_gefi_to_egfr = -0.08 # negative
if Variable("EGFR") in treatments:
egfr = np.full(num_samples, treatments[Variable("EGFR")])
else:
p = _r_exp(
-beta0_egfr
- egf * beta_egf_to_egfr
- u_il6_stat_egfr * beta_u1_to_egfr
- u_tnf_egfr * beta_u2_to_egfr
- u_egf_egfr * beta_u3_to_egfr
- gefi * beta_gefi_to_egfr
)
egfr = generator.binomial(1, p, size=num_samples)

beta0_prr = -1.4
beta_sars_cov2_to_prr = 0.05
beta_u_to_prr = 0.02
loc_prr = _r_exp(-beta0_prr - sars_cov2 * beta_sars_cov2_to_prr - u_prr_nfkb * beta_u_to_prr)
prr = generator.binomial(n=1, p=loc_prr, size=num_samples)

beta0_nfkb = -1.8
beta_prr_to_nfkb = 0.01
beta_u_to_nfkb = 0.02
beta_egfr_to_nfkb = 0.7
beta_tnf_to_nfkb = 0.01
loc_nfkb = _r_exp(
-beta0_nfkb
- prr * beta_prr_to_nfkb
- u_prr_nfkb * beta_u_to_nfkb
- egfr * beta_egfr_to_nfkb
- tnf * beta_tnf_to_nfkb
)
nfkb = generator.binomial(n=1, p=loc_nfkb, size=num_samples)

beta0_il6_stat3 = -1.6
beta_u_to_il6_stat3 = -0.05
beta_sil6r_to_il6_stat3 = 0.04
loc_il6_stat3 = _r_exp(
-beta0_il6_stat3 - u_il6_stat_egfr * beta_u_to_il6_stat3 - sil6r * beta_sil6r_to_il6_stat3
)
il6_stat3 = generator.binomial(n=1, p=loc_il6_stat3, size=num_samples)

beta0_il6_amp = -1.1
beta_nfkb_to_il6_amp = -5
beta_il6_stat3_to_il6_amp = 0.03
loc_il6_amp = _r_exp(
-beta0_il6_amp - nfkb * beta_nfkb_to_il6_amp - il6_stat3 * beta_il6_stat3_to_il6_amp
)
il6_amp = generator.binomial(n=1, p=loc_il6_amp, size=num_samples)

beta0_cytok = -1.1
beta_il6_amp_tocytok = 4
loc_cytok = _r_exp(-beta0_cytok - il6_amp * beta_il6_amp_tocytok)
cytok = generator.binomial(n=1, p=loc_cytok, size=num_samples)

data = {
"SARS_COV2": sars_cov2,
"ACE2": ace2,
"Ang": ang,
"AGTR1": agtr1,
"ADAM17": adam17,
"Toci": toci,
"Sil6r": sil6r,
"EGF": egf,
"TNF": tnf,
"Gefi": gefi,
"EGFR": egfr,
"PRR": prr,
"NFKB": nfkb,
"IL6STAT3": il6_stat3,
"IL6AMP": il6_amp,
"cytok": cytok,
}
df = pd.DataFrame(data)
return df


sars_cov_2_example = Example(
name="SARS-CoV-2 Graph",
reference="Mohammad-Taheri, S., Zucker, J., Hoyt, C. T., Sachs, K., Tewari, V., Ness, R., & Vitek,"
Expand All @@ -51,6 +221,7 @@
" Reasoner and Assembler (INDRA) workflow, and by quering and expressing the corresponding causal"
" statements in the Biological Expression Language (BEL). Presence of latent variables was determined"
" by querying pairs of entities in the network for common causes in the corpus.",
generate_data=generate_discrete,
# FIXME 100% detail on exactly how this network was constructed. What queries were run on INDRA,
# what code was used to do it, what parts were done by hand
example_queries=[
Expand Down
7 changes: 6 additions & 1 deletion src/eliater/examples/t_cell_signaling_pathway.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@
graph=graph,
description="This is an example of a protein signaling network of the T cell signaling pathway"
"It models the molecular mechanisms and regulatory processes of human cells involved"
"in T cell activation, proliferation, and function.",
"in T cell activation, proliferation, and function. The observational data consisted of quantitative"
" multivariate flow cytometry measureents of phosphorylated proteins derived from thousands of individual"
" primary immune system cells. The cells were subjected to general stimuli meant to activate the desired "
"paths. The distributions of measurements of individual proeins were skewed, and pairs of proteins exhibited"
" nonlinear relationships. To account for that, the data were binned into two levels corresponding to low, and"
" high concentrations to preserve the dependence structure of the original data.",
example_queries=[Query.from_str(treatments="Raf", outcomes="Erk")],
)

Expand Down

0 comments on commit 18c0c18

Please sign in to comment.