Skip to content

Commit

Permalink
Merge pull request #132 from theislab/development
Browse files Browse the repository at this point in the history
Data downloading URL, actions, and dependencies.
  • Loading branch information
ilibarra authored Aug 24, 2024
2 parents cc7e67f + 2612564 commit 0a2efa2
Show file tree
Hide file tree
Showing 8 changed files with 2,198 additions and 5,655 deletions.
7,739 changes: 2,106 additions & 5,633 deletions docs/notebooks/single_cell/02_2_1_scatac_multiome_pancreas_priors_train.ipynb

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dentate gyrus (scRNA-seq) | Training with a RNA-dynamics kNN-graph"
"## Dentate gyrus (scRNA-seq) | Training with an RNA-dynamics kNN-graph"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mouse neurogenesis scRNA-seq (Noack et al. 2022) | Training with a RNA-dynamics kNN-graph\n"
"## Mouse neurogenesis scRNA-seq (Noack et al. 2022) | Training with an RNA-dynamics kNN-graph\n"
]
},
{
Expand Down
3 changes: 2 additions & 1 deletion mubind/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
simulate_data,
simulate_xy,
cisbp_hs, genre,
archetypes, archetypes_anno, archetypes_clu, # pwm datasets
archetypes, archetypes_anno, archetypes_clu, archetypes_pickle, # pwm datasets
pancreas_multiome,
pancreas_rna,
pancreas_rna_pytest,
pancreas_atac,
)
84 changes: 71 additions & 13 deletions mubind/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import pandas as pd
import os
import pickle

import urllib.request

# Class for reading training/testing SELEX dataset files.
class SelexDataset(tdata.Dataset):
def __init__(self, df, n_rounds=None, enr_series=True, single_encoding_step=False, store_rev=False,
Expand Down Expand Up @@ -476,23 +477,46 @@ def genre(**kwargs):
return pwms

def archetypes_anno(**kwargs):
    """Load the archetype cluster annotation table.

    Downloads ``motif_annotations.xlsx`` from ``kwargs['url']`` on first use
    and caches it under ``data/archetypes`` to avoid future redownloads.

    Parameters
    ----------
    **kwargs
        Must contain ``url``: download location of ``motif_annotations.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The 'Archetype clusters' sheet of the annotation workbook.
    """
    # read reference clusters
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'motif_annotations.xlsx')

    # save to avoid future redownloads
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(kwargs['url'], archetypes_path)

    anno = pd.read_excel(archetypes_path, sheet_name='Archetype clusters')
    return anno

def archetypes_clu(**kwargs):
    """Load the per-motif cluster table ('Motifs' sheet).

    Downloads ``motif_annotations.xlsx`` from ``kwargs['url']`` on first use
    and caches it locally, mirroring ``archetypes_anno``. (Previously this
    function assumed the file was already present and raised otherwise.)

    Parameters
    ----------
    **kwargs
        Must contain ``url``: download location of ``motif_annotations.xlsx``.

    Returns
    -------
    pandas.DataFrame
        The 'Motifs' sheet of the annotation workbook.
    """
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'motif_annotations.xlsx')

    # download once and cache, so a cold 'data/' directory does not fail
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(kwargs['url'], archetypes_path)

    clu = pd.read_excel(archetypes_path, sheet_name='Motifs')
    return clu

def archetypes(**kwargs):
ppm_by_name = {}
archetypes_dir = os.path.join(mb.bindome.constants.ANNOTATIONS_DIRECTORY, 'archetypes')
def archetypes_pickle(**kwargs):
    """Load the pickled PWM dictionary (``archetypes_data.pkl``).

    Downloads the pickle from ``kwargs['url']`` on first use and caches it
    under ``data/archetypes``. The previous version also called
    ``archetypes_anno(**kwargs)`` twice here with the *pickle* URL, which
    was unused and would have cached the pickle bytes as
    ``motif_annotations.xlsx`` — those calls are removed.

    Parameters
    ----------
    **kwargs
        Must contain ``url``: download location of ``archetypes_data.pkl``.

    Returns
    -------
    dict
        Mapping of motif name to PPM, as stored in the pickle.
    """
    # read reference clusters
    archetypes_dir = 'data/archetypes'
    archetypes_path = os.path.join(archetypes_dir, 'archetypes_data.pkl')

    # save to avoid future redownloads
    if not os.path.exists(archetypes_path):
        os.makedirs(archetypes_dir, exist_ok=True)
        urllib.request.urlretrieve(kwargs['url'], archetypes_path)

    # NOTE(review): unpickling downloaded data executes arbitrary code —
    # only fetch this from trusted URLs.
    with open(archetypes_path, 'rb') as handle:
        ppm_by_name = pickle.load(handle)
    return ppm_by_name


def archetypes_meme(**kwargs):
ppm_by_name = {}
archetypes_dir = 'data/archetypes'
# read PFM across meme files
for f in os.listdir(archetypes_dir):
if f.endswith('.meme'):
Expand All @@ -513,7 +537,22 @@ def archetypes(**kwargs):
ppm.index = 'A', 'C', 'G', 'T'
ppm_by_name[name] = ppm
print('# motifs loaded %i' % (len(ppm_by_name)))
return ppm_by_name

def archetypes(**kwargs):
# annotation table
url = 'https://www.dropbox.com/scl/fi/odxcg72nj3djbfz6r9nq8/motif_annotations.xlsx?rlkey=qlbyx9m7dj6qqui9ct80q9ejc&dl=1'
kwargs['url'] = url
archetypes_dir = 'data/archetypes'
anno = archetypes_anno(**kwargs)
clu = archetypes_anno(**kwargs)

# PWM weights
url = 'https://www.dropbox.com/scl/fi/gytniua2uay1p6st0svh9/archetypes_data.pkl?rlkey=qe7mzhwaiqfpkjbdj31ijx193&dl=1'
kwargs['url'] = url
ppm_by_name = archetypes_pickle(**kwargs)

# print(clu)
# return non-redundant groups
reduced_groups = []
for k in anno['Seed_motif']:
Expand All @@ -530,10 +569,27 @@ def pancreas_rna(
):
from scanpy import read
# rna
url = 'https://www.dropbox.com/scl/fi/ryb3q25n0kc2vw297f2xd/pancreas_multiome_2022_processed_rna_velocities_2024.h5ad?rlkey=in0qlpv038cn6wxrops1wsxgm&dl=0'
url = 'https://www.dropbox.com/scl/fi/ryb3q25n0kc2vw297f2xd/pancreas_multiome_2022_processed_rna_velocities_2024.h5ad?rlkey=in0qlpv038cn6wxrops1wsxgm&dl=1'
print(os.path.exists(file_path), file_path)
# print('reading RNA')
adata = read(file_path, backup_url=url, sparse=True, cache=True)
adata.var_names_make_unique()
# print('opening RNA successful')
return adata

def pancreas_rna_pytest(
    file_path: Optional[
        Union[str, Path]
    ] = "data/scatac/pancreas_multiome/pancreas_multiome_2022_processed_rna_velocities_2024_pytest.h5ad"
):
    """Load the reduced pancreas multiome RNA dataset used by pytest.

    Reads ``file_path`` if it already exists; otherwise scanpy fetches it
    from the Dropbox backup URL and caches it locally.

    Returns
    -------
    AnnData with unique variable names.
    """
    from scanpy import read

    # rna (pytest-sized subset of the pancreas multiome RNA data)
    backup_url = (
        'https://www.dropbox.com/scl/fi/93hw0wru56ljryo6m17d9/'
        'pancreas_multiome_2022_processed_rna_velocities_2024_pytest.h5ad'
        '?rlkey=x8r14un3gu8ahyipcylwxytns&dl=1'
    )
    print(os.path.exists(file_path), file_path)
    adata = read(file_path, backup_url=backup_url, sparse=True, cache=True)
    adata.var_names_make_unique()
    return adata

def pancreas_atac(
Expand All @@ -543,9 +599,11 @@ def pancreas_atac(
):
from scanpy import read
# atac
url = 'https://www.dropbox.com/scl/fi/53wv4v7tbnsmr12fbmea7/pancreas_multiome_2022_processed_atac.h5ad?rlkey=1kf352wya0pzffkn990wkbwmd&e=1&st=m6gv9hp5&dl=0'
url = 'https://www.dropbox.com/scl/fi/53wv4v7tbnsmr12fbmea7/pancreas_multiome_2022_processed_atac.h5ad?rlkey=1kf352wya0pzffkn990wkbwmd&e=1&st=m6gv9hp5&dl=1'
print(os.path.exists(file_path), file_path)
print('reading ATAC')
adata = read(file_path, backup_url=url, sparse=True, cache=True)
print('opening ATAC successful')
adata.var_names_make_unique()
return adata

Expand Down
8 changes: 4 additions & 4 deletions mubind/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,10 +828,10 @@ def closure():
self.r2_history += r2_history

def corr_etas_libsizes(self, train):
    """Spearman correlation between per-round log etas and library sizes.

    Sanity check run before refinement: log etas are expected to track
    sequencing depth (per-round total counts).

    Parameters
    ----------
    train
        DataLoader whose ``dataset.rounds`` holds the per-round count matrix.

    Returns
    -------
    tuple
        (label string, result of ``scipy.stats.spearmanr``).
    """
    if self.device != 'cpu':
        etas = self.get_log_etas().detach().cpu().numpy().flatten()
        # BUGFIX: counts must also be moved to host memory before spearmanr;
        # previously a GPU tensor was passed straight through.
        lib_sizes = train.dataset.rounds.sum(axis=0).detach().cpu().numpy().flatten()
    else:
        etas = self.get_log_etas().detach().flatten()
        lib_sizes = train.dataset.rounds.sum(axis=0).flatten()
    return 'etas corr with lib_sizes (before refinement)', spearmanr(etas, lib_sizes)

def optimize_iterative(self,
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ maintainers = [
urls.Documentation = "https://mubind.readthedocs.io/"
urls.Source = "https://github.com/theislab/mubind"
urls.Home-page = "https://github.com/theislab/mubind"
version = "0.2.1"
version = "0.2.2"
requires-python = ">=3.9" # for GPU-rapids
license = {file = "LICENSE"}
readme = "README.md"
dependencies = ["seaborn", "scikit-learn", "pandas", "unidecode", "matplotlib", "scipy", "numpy>=1.22", "torch",
"logomaker", "biopython", "numba", "pytest", "pytest-cov", "openpyxl", "tqdm", "anndata"]
"logomaker", "biopython", "numba", "pytest", "pytest-cov", "openpyxl", "tqdm", "scanpy"]

[project.optional-dependencies]
dev = [
Expand Down
11 changes: 11 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import torch.optim as topti
import torch.utils.data as tdata
import mubind as mb
import pytest

def test_dataset_index_int():
import warnings
Expand Down Expand Up @@ -49,7 +50,17 @@ def test_seq_conversion():

assert (x2 == strs).all()

def test_download_and_load_dataset():
    """Smoke test: the pytest-sized pancreas RNA dataset downloads and loads."""
    # removed unused `import warnings` and the pointless `return None`
    # (pytest warns on tests that return non-None values)
    ad = mb.datasets.pancreas_rna_pytest()
    assert ad is not None

@pytest.mark.filterwarnings("ignore::pytest.PytestUnraisableExceptionWarning")
def test_archetypes():
    """Smoke test: archetype annotations and PWM pickle download and load."""
    # removed unused `import warnings` and the pointless `return None`
    data = mb.datasets.archetypes()
    assert data is not None

def test_dataset_memory_increase():
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
Expand Down

0 comments on commit 0a2efa2

Please sign in to comment.