Skip to content

Commit

Permalink
[BUG] Unsupported graph for similarity algos (#3710)
Browse files Browse the repository at this point in the history
This PR updates the docstrings and raises an error when running any similarity algorithm on a graph whose vertices are not renumbered.

Authors:
  - Joseph Nke (https://github.com/jnke2016)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)

URL: #3710
  • Loading branch information
jnke2016 authored Jul 19, 2023
1 parent b8de24c commit 59b0eb7
Show file tree
Hide file tree
Showing 12 changed files with 234 additions and 12 deletions.
34 changes: 31 additions & 3 deletions python/cugraph/cugraph/link_prediction/jaccard.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -20,7 +20,7 @@
)


def jaccard(input_graph, vertex_pair=None):
def jaccard(input_graph, vertex_pair=None, do_expensive_check=True):
"""
Compute the Jaccard similarity between each pair of vertices connected by
an edge, or between arbitrary pairs of vertices specified by the user.
Expand All @@ -36,6 +36,10 @@ def jaccard(input_graph, vertex_pair=None):
of cugraph.jaccard is different from the behavior of
networkx.jaccard_coefficient.
This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
cugraph.jaccard, in the absence of a specified vertex pair list, will
use the edges of the graph to construct a vertex pair list and will
return the jaccard coefficient for those vertex pairs.
Expand Down Expand Up @@ -80,6 +84,10 @@ def jaccard(input_graph, vertex_pair=None):
current implementation computes the jaccard coefficient for all
adjacent vertices in the graph.
do_expensive_check: bool (default=True)
When set to True, check if the vertices in the graph are (re)numbered
from 0 to V-1 where V is the total number of vertices.
Returns
-------
df : cudf.DataFrame
Expand All @@ -104,6 +112,22 @@ def jaccard(input_graph, vertex_pair=None):
>>> df = cugraph.jaccard(G)
"""
if do_expensive_check:
if not input_graph.renumbered:
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
max_vertex = input_df.max().max()
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
input_df.dtypes[0]
)
nodes = (
cudf.concat([input_df["src"], input_df["dst"]])
.unique()
.sort_values()
.reset_index(drop=True)
)
if not expected_nodes.equals(nodes):
raise ValueError("Unrenumbered vertices are not supported.")

if input_graph.is_directed():
raise ValueError("Input must be an undirected Graph.")
if type(vertex_pair) == cudf.DataFrame:
Expand All @@ -120,10 +144,14 @@ def jaccard(input_graph, vertex_pair=None):
return df


def jaccard_coefficient(G, ebunch=None):
def jaccard_coefficient(G, ebunch=None, do_expensive_check=True):
"""
For NetworkX Compatibility. See `jaccard`
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
Parameters
----------
graph : cugraph.Graph
Expand Down
33 changes: 30 additions & 3 deletions python/cugraph/cugraph/link_prediction/overlap.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -20,10 +20,14 @@
)


def overlap_coefficient(G, ebunch=None):
def overlap_coefficient(G, ebunch=None, do_expensive_check=True):
"""
For NetworkX Compatibility. See `overlap`
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
"""
vertex_pair = None

Expand All @@ -42,7 +46,7 @@ def overlap_coefficient(G, ebunch=None):
return df


def overlap(input_graph, vertex_pair=None):
def overlap(input_graph, vertex_pair=None, do_expensive_check=True):
"""
Compute the Overlap Coefficient between each pair of vertices connected by
an edge, or between arbitrary pairs of vertices specified by the user.
Expand All @@ -54,6 +58,10 @@ def overlap(input_graph, vertex_pair=None):
neighbors. If first is specified but second is not, or vice versa, an
exception will be thrown.
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
Parameters
----------
input_graph : cugraph.Graph
Expand All @@ -66,6 +74,10 @@ def overlap(input_graph, vertex_pair=None):
vertices. If provided, the overlap coefficient is computed for the
given vertex pairs, else, it is computed for all vertex pairs.
do_expensive_check: bool (default=True)
When set to True, check if the vertices in the graph are (re)numbered
from 0 to V-1 where V is the total number of vertices.
Returns
-------
df : cudf.DataFrame
Expand All @@ -90,6 +102,21 @@ def overlap(input_graph, vertex_pair=None):
>>> df = cugraph.overlap(G)
"""
if do_expensive_check:
if not input_graph.renumbered:
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
max_vertex = input_df.max().max()
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
input_df.dtypes[0]
)
nodes = (
cudf.concat([input_df["src"], input_df["dst"]])
.unique()
.sort_values()
.reset_index(drop=True)
)
if not expected_nodes.equals(nodes):
raise ValueError("Unrenumbered vertices are not supported.")

if type(vertex_pair) == cudf.DataFrame:
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
Expand Down
32 changes: 30 additions & 2 deletions python/cugraph/cugraph/link_prediction/sorensen.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)


def sorensen(input_graph, vertex_pair=None):
def sorensen(input_graph, vertex_pair=None, do_expensive_check=True):
"""
Compute the Sorensen coefficient between each pair of vertices connected by
an edge, or between arbitrary pairs of vertices specified by the user.
Expand All @@ -30,6 +30,10 @@ def sorensen(input_graph, vertex_pair=None):
If first is specified but second is not, or vice versa, an exception will
be thrown.
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
cugraph.sorensen, in the absence of a specified vertex pair list, will
use the edges of the graph to construct a vertex pair list and will
return the sorensen coefficient for those vertex pairs.
Expand All @@ -50,6 +54,10 @@ def sorensen(input_graph, vertex_pair=None):
current implementation computes the Sorensen coefficient for all
adjacent vertices in the graph.
do_expensive_check: bool (default=True)
When set to True, check if the vertices in the graph are (re)numbered
from 0 to V-1 where V is the total number of vertices.
Returns
-------
df : cudf.DataFrame
Expand All @@ -76,6 +84,22 @@ def sorensen(input_graph, vertex_pair=None):
>>> df = cugraph.sorensen(G)
"""
if do_expensive_check:
if not input_graph.renumbered:
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
max_vertex = input_df.max().max()
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
input_df.dtypes[0]
)
nodes = (
cudf.concat([input_df["src"], input_df["dst"]])
.unique()
.sort_values()
.reset_index(drop=True)
)
if not expected_nodes.equals(nodes):
raise ValueError("Unrenumbered vertices are not supported.")

if type(input_graph) is not Graph:
raise TypeError("input graph must a Graph")

Expand All @@ -94,10 +118,14 @@ def sorensen(input_graph, vertex_pair=None):
return df


def sorensen_coefficient(G, ebunch=None):
def sorensen_coefficient(G, ebunch=None, do_expensive_check=True):
"""
For NetworkX Compatibility. See `sorensen`
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
Parameters
----------
G : cugraph.Graph
Expand Down
28 changes: 26 additions & 2 deletions python/cugraph/cugraph/link_prediction/wjaccard.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -17,7 +17,7 @@
from cugraph.utilities import renumber_vertex_pair


def jaccard_w(input_graph, weights, vertex_pair=None):
def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
"""
Compute the weighted Jaccard similarity between each pair of vertices
connected by an edge, or between arbitrary pairs of vertices specified by
Expand All @@ -29,6 +29,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
neighbors. If first is specified but second is not, or vice versa, an
exception will be thrown.
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
Parameters
----------
input_graph : cugraph.Graph
Expand All @@ -51,6 +55,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
vertices. If provided, the jaccard coefficient is computed for the
given vertex pairs, else, it is computed for all vertex pairs.
do_expensive_check: bool (default=True)
When set to True, check if the vertices in the graph are (re)numbered
from 0 to V-1 where V is the total number of vertices.
Returns
-------
df : cudf.DataFrame
Expand Down Expand Up @@ -87,6 +95,22 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
>>> df = cugraph.jaccard_w(G, weights)
"""
if do_expensive_check:
if not input_graph.renumbered:
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
max_vertex = input_df.max().max()
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
input_df.dtypes[0]
)
nodes = (
cudf.concat([input_df["src"], input_df["dst"]])
.unique()
.sort_values()
.reset_index(drop=True)
)
if not expected_nodes.equals(nodes):
raise ValueError("Unrenumbered vertices are not supported.")

if type(input_graph) is not Graph:
raise TypeError("input graph must a Graph")

Expand Down
25 changes: 24 additions & 1 deletion python/cugraph/cugraph/link_prediction/woverlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from cugraph.utilities import renumber_vertex_pair


def overlap_w(input_graph, weights, vertex_pair=None):
def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
"""
Compute the weighted Overlap Coefficient between each pair of vertices
connected by an edge, or between arbitrary pairs of vertices specified by
Expand All @@ -28,6 +28,10 @@ def overlap_w(input_graph, weights, vertex_pair=None):
neighbors. If first is specified but second is not, or vice versa, an
exception will be thrown.
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
Parameters
----------
input_graph : cugraph.Graph
Expand All @@ -51,6 +55,10 @@ def overlap_w(input_graph, weights, vertex_pair=None):
vertices. If provided, the overlap coefficient is computed for the
given vertex pairs, else, it is computed for all vertex pairs.
do_expensive_check: bool (default=True)
When set to True, check if the vertices in the graph are (re)numbered
from 0 to V-1 where V is the total number of vertices.
Returns
-------
df : cudf.DataFrame
Expand Down Expand Up @@ -88,6 +96,21 @@ def overlap_w(input_graph, weights, vertex_pair=None):
... len(weights['vertex']))]
>>> df = cugraph.overlap_w(G, weights)
"""
if do_expensive_check:
if not input_graph.renumbered:
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
max_vertex = input_df.max().max()
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
input_df.dtypes[0]
)
nodes = (
cudf.concat([input_df["src"], input_df["dst"]])
.unique()
.sort_values()
.reset_index(drop=True)
)
if not expected_nodes.equals(nodes):
raise ValueError("Unrenumbered vertices are not supported.")

if type(vertex_pair) == cudf.DataFrame:
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)
Expand Down
26 changes: 25 additions & 1 deletion python/cugraph/cugraph/link_prediction/wsorensen.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@
from cugraph.utilities import renumber_vertex_pair


def sorensen_w(input_graph, weights, vertex_pair=None):
def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
"""
Compute the weighted Sorensen similarity between each pair of vertices
connected by an edge, or between arbitrary pairs of vertices specified by
the user. Sorensen coefficient is defined between two sets as the ratio of
twice the volume of their intersection divided by the volume of each set.
NOTE: This algorithm doesn't currently support datasets with vertices that
are not (re)numbered from 0 to V-1, where V is the total number of
vertices, as this creates isolated vertices.
Parameters
----------
input_graph : cugraph.Graph
Expand All @@ -47,6 +51,10 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
vertices. If provided, the sorensen coefficient is computed for the
given vertex pairs, else, it is computed for all vertex pairs.
do_expensive_check: bool (default=True)
When set to True, check if the vertices in the graph are (re)numbered
from 0 to V-1 where V is the total number of vertices.
Returns
-------
df : cudf.DataFrame
Expand Down Expand Up @@ -85,6 +93,22 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
>>> df = cugraph.sorensen_w(G, weights)
"""
if do_expensive_check:
if not input_graph.renumbered:
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
max_vertex = input_df.max().max()
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
input_df.dtypes[0]
)
nodes = (
cudf.concat([input_df["src"], input_df["dst"]])
.unique()
.sort_values()
.reset_index(drop=True)
)
if not expected_nodes.equals(nodes):
raise ValueError("Unrenumbered vertices are not supported.")

if type(input_graph) is not Graph:
raise TypeError("input graph must a Graph")

Expand Down
Loading

0 comments on commit 59b0eb7

Please sign in to comment.