[BUG] Unsupported graph for similiarity algos (#3710)

This PR update the docstrings raises an error when running any similarity algos with vertices from a graph that are unrenumbered. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: #3710
rapidsai · Jul 19, 2023 · 59b0eb7 · 59b0eb7
1 parent b8de24c
commit 59b0eb7
Show file tree

Hide file tree

Showing 12 changed files with 234 additions and 12 deletions.
diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -20,7 +20,7 @@
 )
 
 
-def jaccard(input_graph, vertex_pair=None):
+def jaccard(input_graph, vertex_pair=None, do_expensive_check=True):
     """
     Compute the Jaccard similarity between each pair of vertices connected by
     an edge, or between arbitrary pairs of vertices specified by the user.
@@ -36,6 +36,10 @@ def jaccard(input_graph, vertex_pair=None):
     of cugraph.jaccard is different from the behavior of
     networkx.jaccard_coefficient.
 
+    This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     cugraph.jaccard, in the absence of a specified vertex pair list, will
     use the edges of the graph to construct a vertex pair list and will
     return the jaccard coefficient for those vertex pairs.
@@ -80,6 +84,10 @@ def jaccard(input_graph, vertex_pair=None):
         current implementation computes the jaccard coefficient for all
         adjacent vertices in the graph.
 
+    do_expensive_check: bool (default=True)
+        When set to True, check if the vertices in the graph are (re)numbered
+        from 0 to V-1 where V is the total number of vertices.
+
     Returns
     -------
     df  : cudf.DataFrame
@@ -104,6 +112,22 @@ def jaccard(input_graph, vertex_pair=None):
     >>> df = cugraph.jaccard(G)
 
     """
+    if do_expensive_check:
+        if not input_graph.renumbered:
+            input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
+            max_vertex = input_df.max().max()
+            expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
+                input_df.dtypes[0]
+            )
+            nodes = (
+                cudf.concat([input_df["src"], input_df["dst"]])
+                .unique()
+                .sort_values()
+                .reset_index(drop=True)
+            )
+            if not expected_nodes.equals(nodes):
+                raise ValueError("Unrenumbered vertices are not supported.")
+
     if input_graph.is_directed():
         raise ValueError("Input must be an undirected Graph.")
     if type(vertex_pair) == cudf.DataFrame:
@@ -120,10 +144,14 @@ def jaccard(input_graph, vertex_pair=None):
     return df
 
 
-def jaccard_coefficient(G, ebunch=None):
+def jaccard_coefficient(G, ebunch=None, do_expensive_check=True):
     """
     For NetworkX Compatability.  See `jaccard`
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     Parameters
     ----------
     graph : cugraph.Graph

diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -20,10 +20,14 @@
 )
 
 
-def overlap_coefficient(G, ebunch=None):
+def overlap_coefficient(G, ebunch=None, do_expensive_check=True):
     """
     For NetworkX Compatability.  See `overlap`
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     """
     vertex_pair = None
 
@@ -42,7 +46,7 @@ def overlap_coefficient(G, ebunch=None):
     return df
 
 
-def overlap(input_graph, vertex_pair=None):
+def overlap(input_graph, vertex_pair=None, do_expensive_check=True):
     """
     Compute the Overlap Coefficient between each pair of vertices connected by
     an edge, or between arbitrary pairs of vertices specified by the user.
@@ -54,6 +58,10 @@ def overlap(input_graph, vertex_pair=None):
     neighbors. If first is specified but second is not, or vice versa, an
     exception will be thrown.
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     Parameters
     ----------
     input_graph : cugraph.Graph
@@ -66,6 +74,10 @@ def overlap(input_graph, vertex_pair=None):
         vertices. If provided, the overlap coefficient is computed for the
         given vertex pairs, else, it is computed for all vertex pairs.
 
+    do_expensive_check: bool (default=True)
+        When set to True, check if the vertices in the graph are (re)numbered
+        from 0 to V-1 where V is the total number of vertices.
+
     Returns
     -------
     df : cudf.DataFrame
@@ -90,6 +102,21 @@ def overlap(input_graph, vertex_pair=None):
     >>> df = cugraph.overlap(G)
 
     """
+    if do_expensive_check:
+        if not input_graph.renumbered:
+            input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
+            max_vertex = input_df.max().max()
+            expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
+                input_df.dtypes[0]
+            )
+            nodes = (
+                cudf.concat([input_df["src"], input_df["dst"]])
+                .unique()
+                .sort_values()
+                .reset_index(drop=True)
+            )
+            if not expected_nodes.equals(nodes):
+                raise ValueError("Unrenumbered vertices are not supported.")
 
     if type(vertex_pair) == cudf.DataFrame:
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py
@@ -21,7 +21,7 @@
 )
 
 
-def sorensen(input_graph, vertex_pair=None):
+def sorensen(input_graph, vertex_pair=None, do_expensive_check=True):
     """
     Compute the Sorensen coefficient between each pair of vertices connected by
     an edge, or between arbitrary pairs of vertices specified by the user.
@@ -30,6 +30,10 @@ def sorensen(input_graph, vertex_pair=None):
     If first is specified but second is not, or vice versa, an exception will
     be thrown.
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     cugraph.sorensen, in the absence of a specified vertex pair list, will
     use the edges of the graph to construct a vertex pair list and will
     return the sorensen coefficient for those vertex pairs.
@@ -50,6 +54,10 @@ def sorensen(input_graph, vertex_pair=None):
         current implementation computes the Sorensen coefficient for all
         adjacent vertices in the graph.
 
+    do_expensive_check: bool (default=True)
+        When set to True, check if the vertices in the graph are (re)numbered
+        from 0 to V-1 where V is the total number of vertices.
+
     Returns
     -------
     df  : cudf.DataFrame
@@ -76,6 +84,22 @@ def sorensen(input_graph, vertex_pair=None):
     >>> df = cugraph.sorensen(G)
 
     """
+    if do_expensive_check:
+        if not input_graph.renumbered:
+            input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
+            max_vertex = input_df.max().max()
+            expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
+                input_df.dtypes[0]
+            )
+            nodes = (
+                cudf.concat([input_df["src"], input_df["dst"]])
+                .unique()
+                .sort_values()
+                .reset_index(drop=True)
+            )
+            if not expected_nodes.equals(nodes):
+                raise ValueError("Unrenumbered vertices are not supported.")
+
     if type(input_graph) is not Graph:
         raise TypeError("input graph must a Graph")
 
@@ -94,10 +118,14 @@ def sorensen(input_graph, vertex_pair=None):
     return df
 
 
-def sorensen_coefficient(G, ebunch=None):
+def sorensen_coefficient(G, ebunch=None, do_expensive_check=True):
     """
     For NetworkX Compatability.  See `sorensen`
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     Parameters
     ----------
     G : cugraph.Graph

diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -17,7 +17,7 @@
 from cugraph.utilities import renumber_vertex_pair
 
 
-def jaccard_w(input_graph, weights, vertex_pair=None):
+def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
     """
     Compute the weighted Jaccard similarity between each pair of vertices
     connected by an edge, or between arbitrary pairs of vertices specified by
@@ -29,6 +29,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
     neighbors. If first is specified but second is not, or vice versa, an
     exception will be thrown.
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     Parameters
     ----------
     input_graph : cugraph.Graph
@@ -51,6 +55,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
         vertices. If provided, the jaccard coefficient is computed for the
         given vertex pairs, else, it is computed for all vertex pairs.
 
+    do_expensive_check: bool (default=True)
+        When set to True, check if the vertices in the graph are (re)numbered
+        from 0 to V-1 where V is the total number of vertices.
+
     Returns
     -------
     df : cudf.DataFrame
@@ -87,6 +95,22 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
     >>> df = cugraph.jaccard_w(G, weights)
 
     """
+    if do_expensive_check:
+        if not input_graph.renumbered:
+            input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
+            max_vertex = input_df.max().max()
+            expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
+                input_df.dtypes[0]
+            )
+            nodes = (
+                cudf.concat([input_df["src"], input_df["dst"]])
+                .unique()
+                .sort_values()
+                .reset_index(drop=True)
+            )
+            if not expected_nodes.equals(nodes):
+                raise ValueError("Unrenumbered vertices are not supported.")
+
     if type(input_graph) is not Graph:
         raise TypeError("input graph must a Graph")
 

diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py
@@ -16,7 +16,7 @@
 from cugraph.utilities import renumber_vertex_pair
 
 
-def overlap_w(input_graph, weights, vertex_pair=None):
+def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
     """
     Compute the weighted Overlap Coefficient between each pair of vertices
     connected by an edge, or between arbitrary pairs of vertices specified by
@@ -28,6 +28,10 @@ def overlap_w(input_graph, weights, vertex_pair=None):
     neighbors. If first is specified but second is not, or vice versa, an
     exception will be thrown.
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     Parameters
     ----------
     input_graph : cugraph.Graph
@@ -51,6 +55,10 @@ def overlap_w(input_graph, weights, vertex_pair=None):
         vertices. If provided, the overlap coefficient is computed for the
         given vertex pairs, else, it is computed for all vertex pairs.
 
+    do_expensive_check: bool (default=True)
+        When set to True, check if the vertices in the graph are (re)numbered
+        from 0 to V-1 where V is the total number of vertices.
+
     Returns
     -------
     df : cudf.DataFrame
@@ -88,6 +96,21 @@ def overlap_w(input_graph, weights, vertex_pair=None):
     ...                      len(weights['vertex']))]
     >>> df = cugraph.overlap_w(G, weights)
     """
+    if do_expensive_check:
+        if not input_graph.renumbered:
+            input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
+            max_vertex = input_df.max().max()
+            expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
+                input_df.dtypes[0]
+            )
+            nodes = (
+                cudf.concat([input_df["src"], input_df["dst"]])
+                .unique()
+                .sort_values()
+                .reset_index(drop=True)
+            )
+            if not expected_nodes.equals(nodes):
+                raise ValueError("Unrenumbered vertices are not supported.")
 
     if type(vertex_pair) == cudf.DataFrame:
         vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py
@@ -17,13 +17,17 @@
 from cugraph.utilities import renumber_vertex_pair
 
 
-def sorensen_w(input_graph, weights, vertex_pair=None):
+def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
     """
     Compute the weighted Sorensen similarity between each pair of vertices
     connected by an edge, or between arbitrary pairs of vertices specified by
     the user. Sorensen coefficient is defined between two sets as the ratio of
     twice the volume of their intersection divided by the volume of each set.
 
+    NOTE: This algorithm doesn't currently support datasets with vertices that
+    are not (re)numebred vertices from 0 to V-1 where V is the total number of
+    vertices as this creates isolated vertices.
+
     Parameters
     ----------
     input_graph : cugraph.Graph
@@ -47,6 +51,10 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
         vertices. If provided, the sorensen coefficient is computed for the
         given vertex pairs, else, it is computed for all vertex pairs.
 
+    do_expensive_check: bool (default=True)
+        When set to True, check if the vertices in the graph are (re)numbered
+        from 0 to V-1 where V is the total number of vertices.
+
     Returns
     -------
     df : cudf.DataFrame
@@ -85,6 +93,22 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
     >>> df = cugraph.sorensen_w(G, weights)
 
     """
+    if do_expensive_check:
+        if not input_graph.renumbered:
+            input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
+            max_vertex = input_df.max().max()
+            expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
+                input_df.dtypes[0]
+            )
+            nodes = (
+                cudf.concat([input_df["src"], input_df["dst"]])
+                .unique()
+                .sort_values()
+                .reset_index(drop=True)
+            )
+            if not expected_nodes.equals(nodes):
+                raise ValueError("Unrenumbered vertices are not supported.")
+
     if type(input_graph) is not Graph:
         raise TypeError("input graph must a Graph")