From 5547cd1f6b24eed7499c6492bf5c1ef4e2ef9f7a Mon Sep 17 00:00:00 2001
From: John Arevalo
Date: Wed, 7 Aug 2024 12:18:08 -0400
Subject: [PATCH] Skip labels before loop (#394)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Skip labels before loop

* Fix lint

---------

Co-authored-by: Michaela Müller <51025211+mumichae@users.noreply.github.com>
---
 scib/metrics/kbet.py | 116 +++++++++++++++++++++----------------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/scib/metrics/kbet.py b/scib/metrics/kbet.py
index 0c13c9bc..238480ef 100644
--- a/scib/metrics/kbet.py
+++ b/scib/metrics/kbet.py
@@ -107,39 +107,72 @@ def kBET(
     # set upper bound for k0
     size_max = 2**31 - 1

+    # check if neighborhood size too small or only one batch in subset
+    counts = adata_tmp.obs.groupby(label_key).agg(
+        {label_key: "count", batch_key: "nunique"}
+    )
+    labels = counts.query(f"{label_key}>=10 and {batch_key} > 1").index
+    skipped = counts.index.difference(labels)
+    print(f"{len(skipped)} labels consist of a single batch or is too small. Skip.")
     # prepare call of kBET per cluster
-    kBET_scores = {"cluster": [], "kBET": []}
-    for clus in adata_tmp.obs[label_key].unique():
+    kBET_scores = {"cluster": list(skipped), "kBET": [np.nan] * len(skipped)}
+    for clus in labels:
         # subset by label
         adata_sub = adata_tmp[adata_tmp.obs[label_key] == clus, :].copy()

-        # check if neighborhood size too small or only one batch in subset
-        if np.logical_or(
-            adata_sub.n_obs < 10, len(adata_sub.obs[batch_key].cat.categories) == 1
-        ):
-            print(f"{clus} consists of a single batch or is too small. Skip.")
-            score = np.nan
-        else:
-            quarter_mean = np.floor(
-                np.mean(adata_sub.obs[batch_key].value_counts()) / 4
-            ).astype("int")
-            k0 = np.min([70, np.max([10, quarter_mean])])
-            # check k0 for reasonability
-            if k0 * adata_sub.n_obs >= size_max:
-                k0 = np.floor(size_max / adata_sub.n_obs).astype("int")
+        quarter_mean = np.floor(
+            np.mean(adata_sub.obs[batch_key].value_counts()) / 4
+        ).astype("int")
+        k0 = np.min([70, np.max([10, quarter_mean])])
+        # check k0 for reasonability
+        if k0 * adata_sub.n_obs >= size_max:
+            k0 = np.floor(size_max / adata_sub.n_obs).astype("int")

-            matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))
+        matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))

-            if verbose:
-                print(f"Use {k0} nearest neighbors.")
+        if verbose:
+            print(f"Use {k0} nearest neighbors.")

-            n_comp, labs = scipy.sparse.csgraph.connected_components(
-                adata_sub.obsp["connectivities"], connection="strong"
+        n_comp, labs = scipy.sparse.csgraph.connected_components(
+            adata_sub.obsp["connectivities"], connection="strong"
+        )
+
+        if n_comp == 1:  # a single component to compute kBET on
+            try:
+                nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
+                # call kBET
+                score = kBET_single(
+                    matrix=matrix,
+                    batch=adata_sub.obs[batch_key],
+                    knn=nn_index_tmp
+                    + 1,  # nn_index in python is 0-based and 1-based in R
+                    verbose=verbose,
+                    k0=k0,
+                )
+            except NeighborsError:
+                print("Not enough neighbours")
+                score = 1  # i.e. 100% rejection
+
+        else:
+            # check the number of components where kBET can be computed upon
+            comp_size = pd.value_counts(labs)
+            # check which components are small
+            comp_size_thresh = 3 * k0
+            idx_nonan = np.flatnonzero(
+                np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
             )

-            if n_comp == 1:  # a single component to compute kBET on
+            # check if 75% of all cells can be used for kBET run
+            if len(idx_nonan) / len(labs) >= 0.75:
+                # create another subset of components, assume they are not visited in a diffusion process
+                adata_sub_sub = adata_sub[idx_nonan, :].copy()
+                nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
+                nn_index_tmp[:] = np.nan
+
                 try:
-                    nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
+                    nn_index_tmp[idx_nonan] = diffusion_nn(adata_sub_sub, k=k0).astype(
+                        "float"
+                    )
                     # call kBET
                     score = kBET_single(
                         matrix=matrix,
@@ -152,41 +185,8 @@ def kBET(
                 except NeighborsError:
                     print("Not enough neighbours")
                     score = 1  # i.e. 100% rejection
-
-            else:
-                # check the number of components where kBET can be computed upon
-                comp_size = pd.value_counts(labs)
-                # check which components are small
-                comp_size_thresh = 3 * k0
-                idx_nonan = np.flatnonzero(
-                    np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
-                )
-
-                # check if 75% of all cells can be used for kBET run
-                if len(idx_nonan) / len(labs) >= 0.75:
-                    # create another subset of components, assume they are not visited in a diffusion process
-                    adata_sub_sub = adata_sub[idx_nonan, :].copy()
-                    nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
-                    nn_index_tmp[:] = np.nan
-
-                    try:
-                        nn_index_tmp[idx_nonan] = diffusion_nn(
-                            adata_sub_sub, k=k0
-                        ).astype("float")
-                        # call kBET
-                        score = kBET_single(
-                            matrix=matrix,
-                            batch=adata_sub.obs[batch_key],
-                            knn=nn_index_tmp
-                            + 1,  # nn_index in python is 0-based and 1-based in R
-                            verbose=verbose,
-                            k0=k0,
-                        )
-                    except NeighborsError:
-                        print("Not enough neighbours")
-                        score = 1  # i.e. 100% rejection
-                else:  # if there are too many too small connected components, set kBET score to 1
-                    score = 1  # i.e. 100% rejection
+                else:  # if there are too many too small connected components, set kBET score to 1
+                    score = 1  # i.e. 100% rejection

         kBET_scores["cluster"].append(clus)
         kBET_scores["kBET"].append(score)
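
Note on the change (illustrative sketch, not part of the patch): the skip check that used to run inside the per-cluster loop is now computed once, up front, with a single groupby/agg/query over adata_tmp.obs. The snippet below exercises that pre-filtering step on a toy obs table using plain pandas; the column names "cell_type" and "batch" and the data are made up for the example, while the ">= 10 cells" and "> 1 batch" thresholds mirror the patch.

import pandas as pd

# Toy stand-in for adata_tmp.obs (hypothetical labels and batches).
label_key, batch_key = "cell_type", "batch"
obs = pd.DataFrame(
    {
        label_key: ["T"] * 12 + ["B"] * 12 + ["NK"] * 3,
        batch_key: ["b1", "b2"] * 12 + ["b1"] * 3,
    }
)

# One pass over obs: per label, count cells and count distinct batches.
counts = obs.groupby(label_key).agg({label_key: "count", batch_key: "nunique"})

# Keep labels with at least 10 cells and more than one batch; everything else
# is identified before the loop instead of being re-checked inside it.
labels = counts.query(f"{label_key} >= 10 and {batch_key} > 1").index
skipped = counts.index.difference(labels)

print(list(labels))   # ['B', 'T']  -> kBET would be computed for these
print(list(skipped))  # ['NK']      -> skipped, never enters the loop

Because the skipped labels are seeded into kBET_scores with NaN scores up front, the loop body no longer needs the np.logical_or branch that this patch removes.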