From 5547cd1f6b24eed7499c6492bf5c1ef4e2ef9f7a Mon Sep 17 00:00:00 2001
From: John Arevalo
Date: Wed, 7 Aug 2024 12:18:08 -0400
Subject: [PATCH] Skip labels before loop (#394)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Skip labels before loop

* Fix lint

---------

Co-authored-by: Michaela Müller <51025211+mumichae@users.noreply.github.com>
---
 scib/metrics/kbet.py | 116 +++++++++++++++++++++----------------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/scib/metrics/kbet.py b/scib/metrics/kbet.py
index 0c13c9bc..238480ef 100644
--- a/scib/metrics/kbet.py
+++ b/scib/metrics/kbet.py
@@ -107,39 +107,72 @@ def kBET(
     # set upper bound for k0
     size_max = 2**31 - 1

+    # check if neighborhood size too small or only one batch in subset
+    counts = adata_tmp.obs.groupby(label_key).agg(
+        {label_key: "count", batch_key: "nunique"}
+    )
+    labels = counts.query(f"{label_key}>=10 and {batch_key} > 1").index
+    skipped = counts.index.difference(labels)
+    print(f"{len(skipped)} labels consist of a single batch or is too small. Skip.")
     # prepare call of kBET per cluster
-    kBET_scores = {"cluster": [], "kBET": []}
-    for clus in adata_tmp.obs[label_key].unique():
+    kBET_scores = {"cluster": list(skipped), "kBET": [np.nan] * len(skipped)}
+    for clus in labels:
         # subset by label
         adata_sub = adata_tmp[adata_tmp.obs[label_key] == clus, :].copy()

-        # check if neighborhood size too small or only one batch in subset
-        if np.logical_or(
-            adata_sub.n_obs < 10, len(adata_sub.obs[batch_key].cat.categories) == 1
-        ):
-            print(f"{clus} consists of a single batch or is too small. Skip.")
-            score = np.nan
-        else:
-            quarter_mean = np.floor(
-                np.mean(adata_sub.obs[batch_key].value_counts()) / 4
-            ).astype("int")
-            k0 = np.min([70, np.max([10, quarter_mean])])
-            # check k0 for reasonability
-            if k0 * adata_sub.n_obs >= size_max:
-                k0 = np.floor(size_max / adata_sub.n_obs).astype("int")
+        quarter_mean = np.floor(
+            np.mean(adata_sub.obs[batch_key].value_counts()) / 4
+        ).astype("int")
+        k0 = np.min([70, np.max([10, quarter_mean])])
+        # check k0 for reasonability
+        if k0 * adata_sub.n_obs >= size_max:
+            k0 = np.floor(size_max / adata_sub.n_obs).astype("int")

-            matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))
+        matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))

-            if verbose:
-                print(f"Use {k0} nearest neighbors.")
+        if verbose:
+            print(f"Use {k0} nearest neighbors.")

-            n_comp, labs = scipy.sparse.csgraph.connected_components(
-                adata_sub.obsp["connectivities"], connection="strong"
+        n_comp, labs = scipy.sparse.csgraph.connected_components(
+            adata_sub.obsp["connectivities"], connection="strong"
+        )
+
+        if n_comp == 1:  # a single component to compute kBET on
+            try:
+                nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
+                # call kBET
+                score = kBET_single(
+                    matrix=matrix,
+                    batch=adata_sub.obs[batch_key],
+                    knn=nn_index_tmp
+                    + 1,  # nn_index in python is 0-based and 1-based in R
+                    verbose=verbose,
+                    k0=k0,
+                )
+            except NeighborsError:
+                print("Not enough neighbours")
+                score = 1  # i.e. 100% rejection
+
+        else:
+            # check the number of components where kBET can be computed upon
+            comp_size = pd.value_counts(labs)
+            # check which components are small
+            comp_size_thresh = 3 * k0
+            idx_nonan = np.flatnonzero(
+                np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
             )

-            if n_comp == 1:  # a single component to compute kBET on
+            # check if 75% of all cells can be used for kBET run
+            if len(idx_nonan) / len(labs) >= 0.75:
+                # create another subset of components, assume they are not visited in a diffusion process
+                adata_sub_sub = adata_sub[idx_nonan, :].copy()
+                nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
+                nn_index_tmp[:] = np.nan
+
                 try:
-                    nn_index_tmp = diffusion_nn(adata_sub, k=k0).astype("float")
+                    nn_index_tmp[idx_nonan] = diffusion_nn(adata_sub_sub, k=k0).astype(
+                        "float"
+                    )
                     # call kBET
                     score = kBET_single(
                         matrix=matrix,
@@ -152,41 +185,8 @@ def kBET(
                 except NeighborsError:
                     print("Not enough neighbours")
                     score = 1  # i.e. 100% rejection
-
-            else:
-                # check the number of components where kBET can be computed upon
-                comp_size = pd.value_counts(labs)
-                # check which components are small
-                comp_size_thresh = 3 * k0
-                idx_nonan = np.flatnonzero(
-                    np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)
-                )
-
-                # check if 75% of all cells can be used for kBET run
-                if len(idx_nonan) / len(labs) >= 0.75:
-                    # create another subset of components, assume they are not visited in a diffusion process
-                    adata_sub_sub = adata_sub[idx_nonan, :].copy()
-                    nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
-                    nn_index_tmp[:] = np.nan
-
-                    try:
-                        nn_index_tmp[idx_nonan] = diffusion_nn(
-                            adata_sub_sub, k=k0
-                        ).astype("float")
-                        # call kBET
-                        score = kBET_single(
-                            matrix=matrix,
-                            batch=adata_sub.obs[batch_key],
-                            knn=nn_index_tmp
-                            + 1,  # nn_index in python is 0-based and 1-based in R
-                            verbose=verbose,
-                            k0=k0,
-                        )
-                    except NeighborsError:
-                        print("Not enough neighbours")
-                        score = 1  # i.e. 100% rejection
-                else:  # if there are too many too small connected components, set kBET score to 1
-                    score = 1  # i.e. 100% rejection
+                else:  # if there are too many too small connected components, set kBET score to 1
+                    score = 1  # i.e. 100% rejection

         kBET_scores["cluster"].append(clus)
         kBET_scores["kBET"].append(score)
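
Note on the change (illustrative sketch, not part of the patch): the skip check that used to run inside the per-cluster loop is now computed once, up front, with a single groupby/agg/query over adata_tmp.obs. The snippet below exercises that pre-filtering step on a toy obs table using plain pandas; the column names "cell_type" and "batch" and the data are made up for the example, while the ">= 10 cells" and "> 1 batch" thresholds mirror the patch.

import pandas as pd

# Toy stand-in for adata_tmp.obs (hypothetical labels and batches).
label_key, batch_key = "cell_type", "batch"
obs = pd.DataFrame(
    {
        label_key: ["T"] * 12 + ["B"] * 12 + ["NK"] * 3,
        batch_key: ["b1", "b2"] * 12 + ["b1"] * 3,
    }
)

# One pass over obs: per label, count cells and count distinct batches.
counts = obs.groupby(label_key).agg({label_key: "count", batch_key: "nunique"})

# Keep labels with at least 10 cells and more than one batch; everything else
# is identified before the loop instead of being re-checked inside it.
labels = counts.query(f"{label_key} >= 10 and {batch_key} > 1").index
skipped = counts.index.difference(labels)

print(list(labels))   # ['B', 'T']  -> kBET would be computed for these
print(list(skipped))  # ['NK']      -> skipped, never enters the loop

Because the skipped labels are seeded into kBET_scores with NaN scores up front, the loop body no longer needs the np.logical_or branch that this patch removes.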