Skip to content

Commit

Permalink
Merge pull request #132 from theislab/development
Browse files Browse the repository at this point in the history
Data downloading URL, actions, and dependencies.
  • Loading branch information
ilibarra authored Aug 24, 2024
2 parents cc7e67f + 2612564 commit 0a2efa2
Show file tree
Hide file tree
Showing 8 changed files with 2,198 additions and 5,655 deletions.
7,739 changes: 2,106 additions & 5,633 deletions docs/notebooks/single_cell/02_2_1_scatac_multiome_pancreas_priors_train.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dentate gyrus (scRNA-seq) | Training with a RNA-dynamics kNN-graph"
"## Dentate gyrus (scRNA-seq) | Training with an RNA-dynamics kNN-graph"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mouse neurogenesis scRNA-seq (Noack et al. 2022) | Training with a RNA-dynamics kNN-graph\n"
"## Mouse neurogenesis scRNA-seq (Noack et al. 2022) | Training with an RNA-dynamics kNN-graph\n"
]
},
{
Expand Down
3 changes: 2 additions & 1 deletion mubind/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
simulate_data,
simulate_xy,
cisbp_hs, genre,
archetypes, archetypes_anno, archetypes_clu, # pwm datasets
archetypes, archetypes_anno, archetypes_clu, archetypes_pickle, # pwm datasets
pancreas_multiome,
pancreas_rna,
pancreas_rna_pytest,
pancreas_atac,
)
84 changes: 71 additions & 13 deletions mubind/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import pandas as pd
import os
import pickle

import urllib.request

# Class for reading training/testing SELEX dataset files.
class SelexDataset(tdata.Dataset):
def __init__(self, df, n_rounds=None, enr_series=True, single_encoding_step=False, store_rev=False,
Expand Down Expand Up @@ -476,23 +477,46 @@ def genre(**kwargs):
return pwms

def archetypes_anno(**kwargs):
    """Load the archetype cluster annotation table.

    Downloads ``motif_annotations.xlsx`` from ``kwargs['url']`` on first use
    and caches it under ``data/archetypes`` to avoid future redownloads.

    Parameters
    ----------
    **kwargs
        Must contain ``url``: download location of ``motif_annotations.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The 'Archetype clusters' sheet of the annotation workbook.
    """
    # read reference clusters
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'motif_annotations.xlsx')

    # save to avoid future redownloads
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(kwargs['url'], archetypes_path)

    anno = pd.read_excel(archetypes_path, sheet_name='Archetype clusters')
    return anno

def archetypes_clu(**kwargs):
    """Load the per-motif cluster table ('Motifs' sheet).

    Downloads ``motif_annotations.xlsx`` from ``kwargs['url']`` on first use
    and caches it locally, mirroring ``archetypes_anno``. (Previously this
    function assumed the file was already present and raised otherwise.)

    Parameters
    ----------
    **kwargs
        Must contain ``url``: download location of ``motif_annotations.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The 'Motifs' sheet of the annotation workbook.
    """
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'motif_annotations.xlsx')

    # download once and cache, so a cold 'data/' directory does not fail
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(kwargs['url'], archetypes_path)

    clu = pd.read_excel(archetypes_path, sheet_name='Motifs')
    return clu

def archetypes(**kwargs):
ppm_by_name = {}
archetypes_dir = os.path.join(mb.bindome.constants.ANNOTATIONS_DIRECTORY, 'archetypes')
def archetypes_pickle(**kwargs):
    """Load the pickled PWM dictionary (``archetypes_data.pkl``).

    Downloads the pickle from ``kwargs['url']`` on first use and caches it
    under ``data/archetypes``. The previous version also called
    ``archetypes_anno(**kwargs)`` twice here with the *pickle* URL, which
    was unused and would have cached the pickle bytes as
    ``motif_annotations.xlsx`` — those calls are removed.

    Parameters
    ----------
    **kwargs
        Must contain ``url``: download location of ``archetypes_data.pkl``.

    Returns
    -------
    dict
        Mapping of motif name to PPM, as stored in the pickle.
    """
    # read reference clusters
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'archetypes_data.pkl')

    # save to avoid future redownloads
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(kwargs['url'], archetypes_path)

    # NOTE(review): unpickling downloaded data executes arbitrary code —
    # only fetch this from trusted URLs.
    with open(archetypes_path, 'rb') as handle:
        ppm_by_name = pickle.load(handle)
    return ppm_by_name


def archetypes_meme(**kwargs):
ppm_by_name = {}
archetypes_dir = 'data/archetypes'
# read PFM across meme files
for f in os.listdir(archetypes_dir):
if f.endswith('.meme'):
Expand All @@ -513,7 +537,22 @@ def archetypes(**kwargs):
ppm.index = 'A', 'C', 'G', 'T'
ppm_by_name[name] = ppm
print('# motifs loaded %i' % (len(ppm_by_name)))
return ppm_by_name

def archetypes(**kwargs):
# annotation table
url = 'https://www.dropbox.com/scl/fi/odxcg72nj3djbfz6r9nq8/motif_annotations.xlsx?rlkey=qlbyx9m7dj6qqui9ct80q9ejc&dl=1'
kwargs['url'] = url
archetypes_dir = 'data/archetypes'
anno = archetypes_anno(**kwargs)
clu = archetypes_anno(**kwargs)

# PWM weights
url = 'https://www.dropbox.com/scl/fi/gytniua2uay1p6st0svh9/archetypes_data.pkl?rlkey=qe7mzhwaiqfpkjbdj31ijx193&dl=1'
kwargs['url'] = url
ppm_by_name = archetypes_pickle(**kwargs)

# print(clu)
# return non-redundant groups
reduced_groups = []
for k in anno['Seed_motif']:
Expand All @@ -530,10 +569,27 @@ def pancreas_rna(
):
from scanpy import read
# rna
url = 'https://www.dropbox.com/scl/fi/ryb3q25n0kc2vw297f2xd/pancreas_multiome_2022_processed_rna_velocities_2024.h5ad?rlkey=in0qlpv038cn6wxrops1wsxgm&dl=0'
url = 'https://www.dropbox.com/scl/fi/ryb3q25n0kc2vw297f2xd/pancreas_multiome_2022_processed_rna_velocities_2024.h5ad?rlkey=in0qlpv038cn6wxrops1wsxgm&dl=1'
print(os.path.exists(file_path), file_path)
# print('reading RNA')
adata = read(file_path, backup_url=url, sparse=True, cache=True)
adata.var_names_make_unique()
# print('opening RNA successful')
return adata

def pancreas_rna_pytest(
    file_path: Optional[
        Union[str, Path]
    ] = "data/scatac/pancreas_multiome/pancreas_multiome_2022_processed_rna_velocities_2024_pytest.h5ad"
):
    """Load the reduced pancreas multiome RNA dataset used by pytest.

    Reads ``file_path`` if it already exists; otherwise scanpy fetches it
    from the Dropbox backup URL and caches it locally.

    Returns
    -------
    AnnData with unique variable names.
    """
    from scanpy import read

    # rna (pytest-sized subset of the pancreas multiome RNA data)
    backup_url = (
        'https://www.dropbox.com/scl/fi/93hw0wru56ljryo6m17d9/'
        'pancreas_multiome_2022_processed_rna_velocities_2024_pytest.h5ad'
        '?rlkey=x8r14un3gu8ahyipcylwxytns&dl=1'
    )
    print(os.path.exists(file_path), file_path)
    adata = read(file_path, backup_url=backup_url, sparse=True, cache=True)
    adata.var_names_make_unique()
    return adata

def pancreas_atac(
Expand All @@ -543,9 +599,11 @@ def pancreas_atac(
):
from scanpy import read
# atac
url = 'https://www.dropbox.com/scl/fi/53wv4v7tbnsmr12fbmea7/pancreas_multiome_2022_processed_atac.h5ad?rlkey=1kf352wya0pzffkn990wkbwmd&e=1&st=m6gv9hp5&dl=0'
url = 'https://www.dropbox.com/scl/fi/53wv4v7tbnsmr12fbmea7/pancreas_multiome_2022_processed_atac.h5ad?rlkey=1kf352wya0pzffkn990wkbwmd&e=1&st=m6gv9hp5&dl=1'
print(os.path.exists(file_path), file_path)
print('reading ATAC')
adata = read(file_path, backup_url=url, sparse=True, cache=True)
print('opening ATAC successful')
adata.var_names_make_unique()
return adata

Expand Down
8 changes: 4 additions & 4 deletions mubind/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,10 +828,10 @@ def closure():
self.r2_history += r2_history

def corr_etas_libsizes(self, train):
    """Spearman correlation between per-round log etas and library sizes.

    Sanity check run before refinement: log etas are expected to track
    sequencing depth (per-round total counts).

    Parameters
    ----------
    train
        DataLoader whose ``dataset.rounds`` holds the per-round count matrix.

    Returns
    -------
    tuple
        (label string, result of ``scipy.stats.spearmanr``).
    """
    if self.device != 'cpu':
        etas = self.get_log_etas().detach().cpu().numpy().flatten()
        # BUGFIX: counts must also be moved to host memory before spearmanr;
        # previously a GPU tensor was passed straight through.
        lib_sizes = train.dataset.rounds.sum(axis=0).detach().cpu().numpy().flatten()
    else:
        etas = self.get_log_etas().detach().flatten()
        lib_sizes = train.dataset.rounds.sum(axis=0).flatten()
    return 'etas corr with lib_sizes (before refinement)', spearmanr(etas, lib_sizes)

def optimize_iterative(self,
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ maintainers = [
urls.Documentation = "https://mubind.readthedocs.io/"
urls.Source = "https://github.com/theislab/mubind"
urls.Home-page = "https://github.com/theislab/mubind"
version = "0.2.1"
version = "0.2.2"
requires-python = ">=3.9" # for GPU-rapids
license = {file = "LICENSE"}
readme = "README.md"
dependencies = ["seaborn", "scikit-learn", "pandas", "unidecode", "matplotlib", "scipy", "numpy>=1.22", "torch",
"logomaker", "biopython", "numba", "pytest", "pytest-cov", "openpyxl", "tqdm", "anndata"]
"logomaker", "biopython", "numba", "pytest", "pytest-cov", "openpyxl", "tqdm", "scanpy"]

[project.optional-dependencies]
dev = [
Expand Down
11 changes: 11 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import torch.optim as topti
import torch.utils.data as tdata
import mubind as mb
import pytest

def test_dataset_index_int():
import warnings
Expand Down Expand Up @@ -49,7 +50,17 @@ def test_seq_conversion():

assert (x2 == strs).all()

def test_download_and_load_dataset():
    """Smoke test: the pytest-sized pancreas RNA dataset downloads and loads."""
    # removed unused `import warnings` and the pointless `return None`
    # (pytest warns on tests that return non-None values)
    ad = mb.datasets.pancreas_rna_pytest()
    assert ad is not None

@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
def test_archetypes():
    """Smoke test: archetype annotations and PWM pickle download and load."""
    # removed unused `import warnings` and the pointless `return None`
    data = mb.datasets.archetypes()
    assert data is not None

def test_dataset_memory_increase():
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
Expand Down

0 comments on commit 0a2efa2

Please sign in to comment.