diff --git a/src/eliater/discover_latent_nodes.py b/src/eliater/discover_latent_nodes.py index b582293..5cad59f 100644 --- a/src/eliater/discover_latent_nodes.py +++ b/src/eliater/discover_latent_nodes.py @@ -1,19 +1,19 @@ r"""This module contains methods to discover nuisance nodes in a network. Given an acyclic directed mixed graph (ADMG), along with the treatment and the outcome -of interest, certain observable variables can be regarded as nuisances. This -classification arises because they do not have any impact on the outcome and should not -be involved in the estimation of the treatment's effect on the outcome. These specific -variables are descendants of the variables on all causal paths that are not ancestors of -the outcome. A causal path, in this context, refers to a directed path that starts from the -treatment and leads to the outcome such that all the arrows on the path have the same direction. -This module is designed to identify these variables. - -This process enables us to concentrate on the fundamental variables needed to estimate the -treatment's impact on the outcome. This focus results in more precise estimates with reduced +of interest, certain observable variables do not have any impact on the outcome and should not +be involved in the estimation of the treatment's effect on the outcome. We call such variables +nuisance variables. These specific variables are descendants of the variables on all causal paths +that are not ancestors of the outcome. A causal path, in this context, refers to a directed path +that starts from the treatment and leads to the outcome such that all the arrows on the path have +the same direction. This module is designed to identify these variables. + +This process enables us to concentrate on the fundamental and necessary variables needed to estimate the +treatment's impact on the outcome. This focus results in more precise causal query estimates with reduced variance and bias. In addition, if this process is combined with the simplification function -:func:`y0.algorithm.simplify_latent.simplify_latent_dag` it can help to remove the nuisance variables -from the graph which leads to simpler, more interpretable, and visually more appealing result. +:func:`y0.algorithm.simplify_latent.simplify_latent_dag` it can help to create a new graph that does not +contain nuisance variables. This simplification leads to simpler, more interpretable, and visually more +appealing result. Example ------- @@ -141,8 +141,6 @@ $Z_4$ is removed as its children are a subset of $Z_1$'s children. """ -# FIXME @sara the explanation section is not easy to follow. please revise it. - import itertools from typing import Iterable, Optional, Set, Union diff --git a/src/eliater/examples/ecoli.py b/src/eliater/examples/ecoli.py index 43fc48e..222e112 100644 --- a/src/eliater/examples/ecoli.py +++ b/src/eliater/examples/ecoli.py @@ -1,10 +1,17 @@ -"""Examples for transcriptional Escherichia coli K-12 regulatory network.""" +"""Examples for transcriptional Escherichia coli K-12 regulatory network. -# FIXME add the following documentation. DO NOT remove this fixme without review and confirmation. -# 1. Where did this network come from? What physical experimentation was used to create it? -# What database was it in and how was the database accessed (via code? by hand?) -# 2. What is the biological phenomena described here? More detail needed. -# 3. Is there associated data to go with this graph? Commit in the examples repository +The E. Coli regulatory network was extracted manually (by hand) from the EcoCyc +database [Keseler2021]_ . The nodes represent genes, and the edges represent +regulatory relationships. + +.. [Keseler2021] `The EcoCyc database in 2021 `_ +""" + +# FIXME what kind of regulatory relationships? +# FIXME What kinds of experiments did they come from? +# FIXME What was the method that these experiments got into the database? +# FIXME How was the network extracted manually? What was the thought process? +# FIXME why do the explanations in the module docstring and the example not match? from y0.algorithm.identify import Query from y0.examples import Example @@ -118,8 +125,11 @@ " ... & Vitek, O. (2023). Optimal adjustment sets for causal query estimation in partially" " observed biomolecular networks. Bioinformatics, 39(Supplement_1), i494-i503.", graph=graph, - description="This is the transcriptional E. Coli regulatory network" - " obtained from EcoCyc database ", + description="This is the transcriptional E. Coli regulatory network obtained from EcoCyc database. " + "The experimental data were 260 RNA-seq normalized expression profiles of E. coli K-12" + " MG1655 and BW25113 across 154 unique experimental conditions, extracted from the PRECISE" + " database by (Sastry et al., 2019) from this paper: 'The Escherichia coli transcriptome mostly" + " consists of independently regulated modules' ", example_queries=[Query.from_str(treatments="fur", outcomes="dpiA")], ) diff --git a/src/eliater/examples/sars.py b/src/eliater/examples/sars.py index a766e07..14c2ecd 100644 --- a/src/eliater/examples/sars.py +++ b/src/eliater/examples/sars.py @@ -1,13 +1,17 @@ """Examples for SARS-CoV-2.""" -__all__ = [ - "sars_cov_2_example", -] +import numpy as np +import pandas as pd from y0.algorithm.identify import Query +from y0.dsl import Variable from y0.examples import Example from y0.graph import NxMixedGraph +__all__ = [ + "sars_cov_2_example", +] + graph = NxMixedGraph.from_str_edges( directed=[ ("SARS_COV2", "ACE2"), @@ -39,6 +43,172 @@ ], ) + +def _r_exp(x): + return 1 / (1 + np.exp(x)) + + +def generate_discrete( + num_samples: int, treatments: dict[Variable, float] | None = None, *, seed: int | None = None +) -> pd.DataFrame: + """Generate testing data for the SARS-CoV-2 large graph. + + :param num_samples: The number of samples to generate. Try 1000. + :param treatments: An optional dictionary of the values to fix each variable to. + :param seed: An optional random seed for reproducibility purposes + :returns: A pandas Dataframe with columns corresponding + to the variable names SARS-CoV-2 large graph + """ + if treatments is None: + treatments = {} + generator = np.random.default_rng(seed) + + gefi = generator.binomial(n=1, p=0.9, size=num_samples) + toci = generator.binomial(n=1, p=0.7, size=num_samples) + u_sars_ang = generator.binomial(n=1, p=0.6, size=num_samples) + u_prr_nfkb = generator.binomial(n=1, p=0.6, size=num_samples) + u_egf_egfr = generator.binomial(n=1, p=0.6, size=num_samples) + u_tnf_egfr = generator.binomial(n=1, p=0.6, size=num_samples) + u_il6_stat_egfr = generator.binomial(n=1, p=0.6, size=num_samples) + u_adam17_sil6_ra = generator.binomial(n=1, p=0.6, size=num_samples) + + beta0_u_sars_ang_to_sars_cov2 = -1.8 + beta_u_sars_ang = 0.05 # positive + loc_sars_cov2 = np.array(_r_exp(-beta0_u_sars_ang_to_sars_cov2 - u_sars_ang * beta_u_sars_ang)) + sars_cov2 = generator.binomial(n=1, p=loc_sars_cov2, size=num_samples) + + beta0_sars_cov2_to_ace2 = 1.5 + beta_sars_cov2_to_ace2 = -0.04 # negative + loc_ace2 = _r_exp(-beta0_sars_cov2_to_ace2 - sars_cov2 * beta_sars_cov2_to_ace2) + ace2 = generator.binomial(n=1, p=loc_ace2, size=num_samples) + + beta0_ang = 1.1 + beta_ace2_to_ang = -0.06 # negative + beta_u_sars_ang = 0.05 # positive + loc_ang = _r_exp(-beta0_ang - ace2 * beta_ace2_to_ang - u_sars_ang * beta_u_sars_ang) + ang = generator.binomial(n=1, p=loc_ang, size=num_samples) + + beta0_agtr1 = -1.5 + beta_ang_to_agtr1 = 0.08 + loc_agtr1 = _r_exp(-beta0_agtr1 - ang * beta_ang_to_agtr1) + agtr1 = generator.binomial(n=1, p=loc_agtr1, size=num_samples) + + beta0_adam17 = -1 + beta_agtr1_to_adam17 = 0.04 + beta_u_adam17_sil6r_to_adam17 = 0.04 + loc_adam17 = _r_exp( + -beta0_adam17 + - agtr1 * beta_agtr1_to_adam17 + - u_adam17_sil6_ra * beta_u_adam17_sil6r_to_adam17 + ) + adam17 = generator.binomial(n=1, p=loc_adam17, size=num_samples) + + beta0_sil6r = -1.9 + beta_adam17_to_sil6r = 0.03 + beta_u_to_sil6r = 0.05 + beta_toci_to_sil6r = -0.04 # negative + loc_sil6r = _r_exp( + -beta0_sil6r + - adam17 * beta_adam17_to_sil6r + - u_adam17_sil6_ra * beta_u_to_sil6r + - toci * beta_toci_to_sil6r + ) + sil6r = generator.binomial(n=1, p=loc_sil6r, size=num_samples) + + beta0_egf = -1.6 + beta_adam17_to_egf = 0.03 + beta_u_to_egf = 0.05 + loc_egf = _r_exp(-beta0_egf - adam17 * beta_adam17_to_egf - u_egf_egfr * beta_u_to_egf) + egf = generator.binomial(n=1, p=loc_egf, size=num_samples) + + beta0_tnf = -1.8 + beta_adam17_to_tnf = 0.05 + beta_u_to_tnf = 0.06 + loc_tnf = _r_exp(-beta0_tnf - adam17 * beta_adam17_to_tnf - u_tnf_egfr * beta_u_to_tnf) + tnf = generator.binomial(n=1, p=loc_tnf, size=num_samples) + + beta0_egfr = -1.9 + beta_egf_to_egfr = 0.03 + beta_u1_to_egfr = 0.05 + beta_u2_to_egfr = 0.02 + beta_u3_to_egfr = 0.04 + beta_gefi_to_egfr = -0.08 # negative + if Variable("EGFR") in treatments: + egfr = np.full(num_samples, treatments[Variable("EGFR")]) + else: + p = _r_exp( + -beta0_egfr + - egf * beta_egf_to_egfr + - u_il6_stat_egfr * beta_u1_to_egfr + - u_tnf_egfr * beta_u2_to_egfr + - u_egf_egfr * beta_u3_to_egfr + - gefi * beta_gefi_to_egfr + ) + egfr = generator.binomial(1, p, size=num_samples) + + beta0_prr = -1.4 + beta_sars_cov2_to_prr = 0.05 + beta_u_to_prr = 0.02 + loc_prr = _r_exp(-beta0_prr - sars_cov2 * beta_sars_cov2_to_prr - u_prr_nfkb * beta_u_to_prr) + prr = generator.binomial(n=1, p=loc_prr, size=num_samples) + + beta0_nfkb = -1.8 + beta_prr_to_nfkb = 0.01 + beta_u_to_nfkb = 0.02 + beta_egfr_to_nfkb = 0.7 + beta_tnf_to_nfkb = 0.01 + loc_nfkb = _r_exp( + -beta0_nfkb + - prr * beta_prr_to_nfkb + - u_prr_nfkb * beta_u_to_nfkb + - egfr * beta_egfr_to_nfkb + - tnf * beta_tnf_to_nfkb + ) + nfkb = generator.binomial(n=1, p=loc_nfkb, size=num_samples) + + beta0_il6_stat3 = -1.6 + beta_u_to_il6_stat3 = -0.05 + beta_sil6r_to_il6_stat3 = 0.04 + loc_il6_stat3 = _r_exp( + -beta0_il6_stat3 - u_il6_stat_egfr * beta_u_to_il6_stat3 - sil6r * beta_sil6r_to_il6_stat3 + ) + il6_stat3 = generator.binomial(n=1, p=loc_il6_stat3, size=num_samples) + + beta0_il6_amp = -1.1 + beta_nfkb_to_il6_amp = -5 + beta_il6_stat3_to_il6_amp = 0.03 + loc_il6_amp = _r_exp( + -beta0_il6_amp - nfkb * beta_nfkb_to_il6_amp - il6_stat3 * beta_il6_stat3_to_il6_amp + ) + il6_amp = generator.binomial(n=1, p=loc_il6_amp, size=num_samples) + + beta0_cytok = -1.1 + beta_il6_amp_tocytok = 4 + loc_cytok = _r_exp(-beta0_cytok - il6_amp * beta_il6_amp_tocytok) + cytok = generator.binomial(n=1, p=loc_cytok, size=num_samples) + + data = { + "SARS_COV2": sars_cov2, + "ACE2": ace2, + "Ang": ang, + "AGTR1": agtr1, + "ADAM17": adam17, + "Toci": toci, + "Sil6r": sil6r, + "EGF": egf, + "TNF": tnf, + "Gefi": gefi, + "EGFR": egfr, + "PRR": prr, + "NFKB": nfkb, + "IL6STAT3": il6_stat3, + "IL6AMP": il6_amp, + "cytok": cytok, + } + df = pd.DataFrame(data) + return df + + sars_cov_2_example = Example( name="SARS-CoV-2 Graph", reference="Mohammad-Taheri, S., Zucker, J., Hoyt, C. T., Sachs, K., Tewari, V., Ness, R., & Vitek," @@ -51,6 +221,7 @@ " Reasoner and Assembler (INDRA) workflow, and by quering and expressing the corresponding causal" " statements in the Biological Expression Language (BEL). Presence of latent variables was determined" " by querying pairs of entities in the network for common causes in the corpus.", + generate_data=generate_discrete, # FIXME 100% detail on exactly how this network was constructed. What queries were run on INDRA, # what code was used to do it, what parts were done by hand example_queries=[ diff --git a/src/eliater/examples/t_cell_signaling_pathway.py b/src/eliater/examples/t_cell_signaling_pathway.py index d34f887..24f506d 100644 --- a/src/eliater/examples/t_cell_signaling_pathway.py +++ b/src/eliater/examples/t_cell_signaling_pathway.py @@ -42,7 +42,12 @@ graph=graph, description="This is an example of a protein signaling network of the T cell signaling pathway" "It models the molecular mechanisms and regulatory processes of human cells involved" - "in T cell activation, proliferation, and function.", + "in T cell activation, proliferation, and function. The observational data consisted of quantitative" + " multivariate flow cytometry measureents of phosphorylated proteins derived from thousands of individual" + " primary immune system cells. The cells were subjected to general stimuli meant to activate the desired " + "paths. The distributions of measurements of individual proeins were skewed, and pairs of proteins exhibited" + " nonlinear relationships. To account for that, the data were binned into two levels corresponding to low, and" + " high concentrations to preserve the dependence structure of the original data.", example_queries=[Query.from_str(treatments="Raf", outcomes="Erk")], )