Add documentation updates (#21)

y0-causal-inference · Jan 18, 2024 · 18c0c18 · 18c0c18
1 parent 133407a
commit 18c0c18
Show file tree

Hide file tree

Showing 4 changed files with 210 additions and 26 deletions.
diff --git a/src/eliater/discover_latent_nodes.py b/src/eliater/discover_latent_nodes.py
@@ -1,19 +1,19 @@
 r"""This module contains methods to discover nuisance nodes in a network.
 
 Given an acyclic directed mixed graph (ADMG), along with the treatment and the outcome
-of interest, certain observable variables can be regarded as nuisances. This
-classification arises because they do not have any impact on the outcome and should not
-be involved in the estimation of the treatment's effect on the outcome. These specific
-variables are descendants of the variables on all causal paths that are not ancestors of
-the outcome. A causal path, in this context, refers to a directed path that starts from the
-treatment and leads to the outcome such that all the arrows on the path have the same direction.
-This module is designed to identify these variables.
-
-This process enables us to concentrate on the fundamental variables needed to estimate the
-treatment's impact on the outcome. This focus results in more precise estimates with reduced
+of interest, certain observable variables do not have any impact on the outcome and should not
+be involved in the estimation of the treatment's effect on the outcome. We call such variables
+nuisance variables. These specific variables are descendants of the variables on all causal paths
+that are not ancestors of the outcome. A causal path, in this context, refers to a directed path
+that starts from the treatment and leads to the outcome such that all the arrows on the path have
+the same direction. This module is designed to identify these variables.
+
+This process enables us to concentrate on the fundamental and necessary variables needed to estimate the
+treatment's impact on the outcome. This focus results in more precise causal query estimates with reduced
 variance and bias. In addition, if this process is combined with the simplification function
-:func:`y0.algorithm.simplify_latent.simplify_latent_dag` it can help to remove the nuisance variables
-from the graph which leads to simpler, more interpretable, and visually more appealing result.
+:func:`y0.algorithm.simplify_latent.simplify_latent_dag` it can help to  create a new graph that does not
+contain nuisance variables. This simplification leads to simpler, more interpretable, and visually more
+appealing result.
 
 Example
 -------
@@ -141,8 +141,6 @@
 $Z_4$ is removed as its children are a subset of $Z_1$'s children.
 """
 
-# FIXME @sara the explanation section is not easy to follow. please revise it.
-
 import itertools
 from typing import Iterable, Optional, Set, Union
 

diff --git a/src/eliater/examples/ecoli.py b/src/eliater/examples/ecoli.py
@@ -1,10 +1,17 @@
-"""Examples for transcriptional Escherichia coli K-12 regulatory network."""
+"""Examples for transcriptional Escherichia coli K-12 regulatory network.
 
-# FIXME add the following documentation. DO NOT remove this fixme without review and confirmation.
-#  1. Where did this network come from? What physical experimentation was used to create it?
-#     What database was it in and how was the database accessed (via code? by hand?)
-#  2. What is the biological phenomena described here? More detail needed.
-#  3. Is there associated data to go with this graph? Commit in the examples repository
+The E. Coli regulatory network was extracted manually (by hand) from the EcoCyc
+database [Keseler2021]_ . The nodes represent genes, and the edges represent
+regulatory relationships.
+
+.. [Keseler2021] `The EcoCyc database in 2021 <https://doi.org/10.3389/fmicb.2021.711077>`_
+"""
+
+# FIXME what kind of regulatory relationships?
+# FIXME What kinds of experiments did they come from?
+# FIXME What was the method that these experiments got into the database?
+# FIXME How was the network extracted manually? What was the thought process?
+# FIXME why do the explanations in the module docstring and the example not match?
 
 from y0.algorithm.identify import Query
 from y0.examples import Example
@@ -118,8 +125,11 @@
     " ... & Vitek, O. (2023). Optimal adjustment sets for causal query estimation in partially"
     " observed biomolecular networks. Bioinformatics, 39(Supplement_1), i494-i503.",
     graph=graph,
-    description="This is the transcriptional E. Coli regulatory network"
-    " obtained from EcoCyc database ",
+    description="This is the transcriptional E. Coli regulatory network obtained from EcoCyc database. "
+    "The experimental data were 260 RNA-seq normalized expression profiles of E. coli K-12"
+    " MG1655 and BW25113 across 154 unique experimental conditions, extracted from the PRECISE"
+    " database by (Sastry et al., 2019) from this paper: 'The Escherichia coli transcriptome mostly"
+    " consists of independently regulated modules' ",
     example_queries=[Query.from_str(treatments="fur", outcomes="dpiA")],
 )
 

diff --git a/src/eliater/examples/sars.py b/src/eliater/examples/sars.py
@@ -1,13 +1,17 @@
 """Examples for SARS-CoV-2."""
 
-__all__ = [
-    "sars_cov_2_example",
-]
+import numpy as np
+import pandas as pd
 
 from y0.algorithm.identify import Query
+from y0.dsl import Variable
 from y0.examples import Example
 from y0.graph import NxMixedGraph
 
+__all__ = [
+    "sars_cov_2_example",
+]
+
 graph = NxMixedGraph.from_str_edges(
     directed=[
         ("SARS_COV2", "ACE2"),
@@ -39,6 +43,172 @@
     ],
 )
 
+
+def _r_exp(x):
+    return 1 / (1 + np.exp(x))
+
+
+def generate_discrete(
+    num_samples: int, treatments: dict[Variable, float] | None = None, *, seed: int | None = None
+) -> pd.DataFrame:
+    """Generate testing data for the SARS-CoV-2 large graph.
+
+    :param num_samples: The number of samples to generate. Try 1000.
+    :param treatments: An optional dictionary of the values to fix each variable to.
+    :param seed: An optional random seed for reproducibility purposes
+    :returns: A pandas Dataframe with columns corresponding
+        to the variable names SARS-CoV-2 large graph
+    """
+    if treatments is None:
+        treatments = {}
+    generator = np.random.default_rng(seed)
+
+    gefi = generator.binomial(n=1, p=0.9, size=num_samples)
+    toci = generator.binomial(n=1, p=0.7, size=num_samples)
+    u_sars_ang = generator.binomial(n=1, p=0.6, size=num_samples)
+    u_prr_nfkb = generator.binomial(n=1, p=0.6, size=num_samples)
+    u_egf_egfr = generator.binomial(n=1, p=0.6, size=num_samples)
+    u_tnf_egfr = generator.binomial(n=1, p=0.6, size=num_samples)
+    u_il6_stat_egfr = generator.binomial(n=1, p=0.6, size=num_samples)
+    u_adam17_sil6_ra = generator.binomial(n=1, p=0.6, size=num_samples)
+
+    beta0_u_sars_ang_to_sars_cov2 = -1.8
+    beta_u_sars_ang = 0.05  # positive
+    loc_sars_cov2 = np.array(_r_exp(-beta0_u_sars_ang_to_sars_cov2 - u_sars_ang * beta_u_sars_ang))
+    sars_cov2 = generator.binomial(n=1, p=loc_sars_cov2, size=num_samples)
+
+    beta0_sars_cov2_to_ace2 = 1.5
+    beta_sars_cov2_to_ace2 = -0.04  # negative
+    loc_ace2 = _r_exp(-beta0_sars_cov2_to_ace2 - sars_cov2 * beta_sars_cov2_to_ace2)
+    ace2 = generator.binomial(n=1, p=loc_ace2, size=num_samples)
+
+    beta0_ang = 1.1
+    beta_ace2_to_ang = -0.06  # negative
+    beta_u_sars_ang = 0.05  # positive
+    loc_ang = _r_exp(-beta0_ang - ace2 * beta_ace2_to_ang - u_sars_ang * beta_u_sars_ang)
+    ang = generator.binomial(n=1, p=loc_ang, size=num_samples)
+
+    beta0_agtr1 = -1.5
+    beta_ang_to_agtr1 = 0.08
+    loc_agtr1 = _r_exp(-beta0_agtr1 - ang * beta_ang_to_agtr1)
+    agtr1 = generator.binomial(n=1, p=loc_agtr1, size=num_samples)
+
+    beta0_adam17 = -1
+    beta_agtr1_to_adam17 = 0.04
+    beta_u_adam17_sil6r_to_adam17 = 0.04
+    loc_adam17 = _r_exp(
+        -beta0_adam17
+        - agtr1 * beta_agtr1_to_adam17
+        - u_adam17_sil6_ra * beta_u_adam17_sil6r_to_adam17
+    )
+    adam17 = generator.binomial(n=1, p=loc_adam17, size=num_samples)
+
+    beta0_sil6r = -1.9
+    beta_adam17_to_sil6r = 0.03
+    beta_u_to_sil6r = 0.05
+    beta_toci_to_sil6r = -0.04  # negative
+    loc_sil6r = _r_exp(
+        -beta0_sil6r
+        - adam17 * beta_adam17_to_sil6r
+        - u_adam17_sil6_ra * beta_u_to_sil6r
+        - toci * beta_toci_to_sil6r
+    )
+    sil6r = generator.binomial(n=1, p=loc_sil6r, size=num_samples)
+
+    beta0_egf = -1.6
+    beta_adam17_to_egf = 0.03
+    beta_u_to_egf = 0.05
+    loc_egf = _r_exp(-beta0_egf - adam17 * beta_adam17_to_egf - u_egf_egfr * beta_u_to_egf)
+    egf = generator.binomial(n=1, p=loc_egf, size=num_samples)
+
+    beta0_tnf = -1.8
+    beta_adam17_to_tnf = 0.05
+    beta_u_to_tnf = 0.06
+    loc_tnf = _r_exp(-beta0_tnf - adam17 * beta_adam17_to_tnf - u_tnf_egfr * beta_u_to_tnf)
+    tnf = generator.binomial(n=1, p=loc_tnf, size=num_samples)
+
+    beta0_egfr = -1.9
+    beta_egf_to_egfr = 0.03
+    beta_u1_to_egfr = 0.05
+    beta_u2_to_egfr = 0.02
+    beta_u3_to_egfr = 0.04
+    beta_gefi_to_egfr = -0.08  # negative
+    if Variable("EGFR") in treatments:
+        egfr = np.full(num_samples, treatments[Variable("EGFR")])
+    else:
+        p = _r_exp(
+            -beta0_egfr
+            - egf * beta_egf_to_egfr
+            - u_il6_stat_egfr * beta_u1_to_egfr
+            - u_tnf_egfr * beta_u2_to_egfr
+            - u_egf_egfr * beta_u3_to_egfr
+            - gefi * beta_gefi_to_egfr
+        )
+        egfr = generator.binomial(1, p, size=num_samples)
+
+    beta0_prr = -1.4
+    beta_sars_cov2_to_prr = 0.05
+    beta_u_to_prr = 0.02
+    loc_prr = _r_exp(-beta0_prr - sars_cov2 * beta_sars_cov2_to_prr - u_prr_nfkb * beta_u_to_prr)
+    prr = generator.binomial(n=1, p=loc_prr, size=num_samples)
+
+    beta0_nfkb = -1.8
+    beta_prr_to_nfkb = 0.01
+    beta_u_to_nfkb = 0.02
+    beta_egfr_to_nfkb = 0.7
+    beta_tnf_to_nfkb = 0.01
+    loc_nfkb = _r_exp(
+        -beta0_nfkb
+        - prr * beta_prr_to_nfkb
+        - u_prr_nfkb * beta_u_to_nfkb
+        - egfr * beta_egfr_to_nfkb
+        - tnf * beta_tnf_to_nfkb
+    )
+    nfkb = generator.binomial(n=1, p=loc_nfkb, size=num_samples)
+
+    beta0_il6_stat3 = -1.6
+    beta_u_to_il6_stat3 = -0.05
+    beta_sil6r_to_il6_stat3 = 0.04
+    loc_il6_stat3 = _r_exp(
+        -beta0_il6_stat3 - u_il6_stat_egfr * beta_u_to_il6_stat3 - sil6r * beta_sil6r_to_il6_stat3
+    )
+    il6_stat3 = generator.binomial(n=1, p=loc_il6_stat3, size=num_samples)
+
+    beta0_il6_amp = -1.1
+    beta_nfkb_to_il6_amp = -5
+    beta_il6_stat3_to_il6_amp = 0.03
+    loc_il6_amp = _r_exp(
+        -beta0_il6_amp - nfkb * beta_nfkb_to_il6_amp - il6_stat3 * beta_il6_stat3_to_il6_amp
+    )
+    il6_amp = generator.binomial(n=1, p=loc_il6_amp, size=num_samples)
+
+    beta0_cytok = -1.1
+    beta_il6_amp_tocytok = 4
+    loc_cytok = _r_exp(-beta0_cytok - il6_amp * beta_il6_amp_tocytok)
+    cytok = generator.binomial(n=1, p=loc_cytok, size=num_samples)
+
+    data = {
+        "SARS_COV2": sars_cov2,
+        "ACE2": ace2,
+        "Ang": ang,
+        "AGTR1": agtr1,
+        "ADAM17": adam17,
+        "Toci": toci,
+        "Sil6r": sil6r,
+        "EGF": egf,
+        "TNF": tnf,
+        "Gefi": gefi,
+        "EGFR": egfr,
+        "PRR": prr,
+        "NFKB": nfkb,
+        "IL6STAT3": il6_stat3,
+        "IL6AMP": il6_amp,
+        "cytok": cytok,
+    }
+    df = pd.DataFrame(data)
+    return df
+
+
 sars_cov_2_example = Example(
     name="SARS-CoV-2 Graph",
     reference="Mohammad-Taheri, S., Zucker, J., Hoyt, C. T., Sachs, K., Tewari, V., Ness, R., & Vitek,"
@@ -51,6 +221,7 @@
     " Reasoner and Assembler (INDRA) workflow, and by quering and expressing the corresponding causal"
     " statements in the Biological Expression Language (BEL). Presence of latent variables was determined"
     " by querying pairs of entities in the network for common causes in the corpus.",
+    generate_data=generate_discrete,
     # FIXME 100% detail on exactly how this network was constructed. What queries were run on INDRA,
     #  what code was used to do it, what parts were done by hand
     example_queries=[

diff --git a/src/eliater/examples/t_cell_signaling_pathway.py b/src/eliater/examples/t_cell_signaling_pathway.py
@@ -42,7 +42,12 @@
     graph=graph,
     description="This is an example of a protein signaling network of the T cell signaling pathway"
     "It models the molecular mechanisms and regulatory processes of human cells involved"
-    "in T cell activation, proliferation, and function.",
+    "in T cell activation, proliferation, and function. The observational data consisted of quantitative"
+    " multivariate flow cytometry measureents of phosphorylated proteins derived from thousands of individual"
+    " primary immune system cells. The cells were subjected to general stimuli meant to activate the desired "
+    "paths. The distributions of measurements of individual proeins were skewed, and pairs of proteins exhibited"
+    " nonlinear relationships. To account for that, the data were binned into two levels corresponding to low, and"
+    " high concentrations to preserve the dependence structure of the original data.",
     example_queries=[Query.from_str(treatments="Raf", outcomes="Erk")],
 )