From 59b0eb70c4e3157c3184128bda52104c925bef25 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Wed, 19 Jul 2023 13:39:29 +0100 Subject: [PATCH] [BUG] Unsupported graph for similiarity algos (#3710) This PR updates the docstrings and raises an error when any similarity algorithm is run on a graph whose vertices are not renumbered. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/3710 --- .../cugraph/link_prediction/jaccard.py | 34 +++++++++++++++++-- .../cugraph/link_prediction/overlap.py | 33 ++++++++++++++++-- .../cugraph/link_prediction/sorensen.py | 32 +++++++++++++++-- .../cugraph/link_prediction/wjaccard.py | 28 +++++++++++++-- .../cugraph/link_prediction/woverlap.py | 25 +++++++++++++- .../cugraph/link_prediction/wsorensen.py | 26 +++++++++++++- .../tests/link_prediction/test_jaccard.py | 12 +++++++ .../tests/link_prediction/test_overlap.py | 11 ++++++ .../tests/link_prediction/test_sorensen.py | 12 +++++++ .../tests/link_prediction/test_wjaccard.py | 11 ++++++ .../tests/link_prediction/test_woverlap.py | 11 ++++++ .../tests/link_prediction/test_wsorensen.py | 11 ++++++ 12 files changed, 234 insertions(+), 12 deletions(-) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 1c4fed7a8f9..dd411fa889d 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -20,7 +20,7 @@ ) -def jaccard(input_graph, vertex_pair=None): +def jaccard(input_graph, vertex_pair=None, do_expensive_check=True): """ Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -36,6 +36,10 @@ def jaccard(input_graph, vertex_pair=None): of cugraph.jaccard is different from the behavior of networkx.jaccard_coefficient. + This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + cugraph.jaccard, in the absence of a specified vertex pair list, will use the edges of the graph to construct a vertex pair list and will return the jaccard coefficient for those vertex pairs. @@ -80,6 +84,10 @@ def jaccard(input_graph, vertex_pair=None): current implementation computes the jaccard coefficient for all adjacent vertices in the graph. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. 
+ Returns ------- df : cudf.DataFrame @@ -104,6 +112,22 @@ def jaccard(input_graph, vertex_pair=None): >>> df = cugraph.jaccard(G) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if input_graph.is_directed(): raise ValueError("Input must be an undirected Graph.") if type(vertex_pair) == cudf.DataFrame: @@ -120,10 +144,14 @@ def jaccard(input_graph, vertex_pair=None): return df -def jaccard_coefficient(G, ebunch=None): +def jaccard_coefficient(G, ebunch=None, do_expensive_check=True): """ For NetworkX Compatability. See `jaccard` + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- graph : cugraph.Graph diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index ba9f225062e..e05e0c944fe 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,10 +20,14 @@ ) -def overlap_coefficient(G, ebunch=None): +def overlap_coefficient(G, ebunch=None, do_expensive_check=True): """ For NetworkX Compatability. 
See `overlap` + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + """ vertex_pair = None @@ -42,7 +46,7 @@ def overlap_coefficient(G, ebunch=None): return df -def overlap(input_graph, vertex_pair=None): +def overlap(input_graph, vertex_pair=None, do_expensive_check=True): """ Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -54,6 +58,10 @@ def overlap(input_graph, vertex_pair=None): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- input_graph : cugraph.Graph @@ -66,6 +74,10 @@ def overlap(input_graph, vertex_pair=None): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. 
+ Returns ------- df : cudf.DataFrame @@ -90,6 +102,21 @@ def overlap(input_graph, vertex_pair=None): >>> df = cugraph.overlap(G) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") if type(vertex_pair) == cudf.DataFrame: vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 20238e10464..0f35f868b7c 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -21,7 +21,7 @@ ) -def sorensen(input_graph, vertex_pair=None): +def sorensen(input_graph, vertex_pair=None, do_expensive_check=True): """ Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. @@ -30,6 +30,10 @@ def sorensen(input_graph, vertex_pair=None): If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + cugraph.sorensen, in the absence of a specified vertex pair list, will use the edges of the graph to construct a vertex pair list and will return the sorensen coefficient for those vertex pairs. @@ -50,6 +54,10 @@ def sorensen(input_graph, vertex_pair=None): current implementation computes the Sorensen coefficient for all adjacent vertices in the graph. 
+ do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -76,6 +84,22 @@ def sorensen(input_graph, vertex_pair=None): >>> df = cugraph.sorensen(G) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if type(input_graph) is not Graph: raise TypeError("input graph must a Graph") @@ -94,10 +118,14 @@ def sorensen(input_graph, vertex_pair=None): return df -def sorensen_coefficient(G, ebunch=None): +def sorensen_coefficient(G, ebunch=None, do_expensive_check=True): """ For NetworkX Compatability. See `sorensen` + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- G : cugraph.Graph diff --git a/python/cugraph/cugraph/link_prediction/wjaccard.py b/python/cugraph/cugraph/link_prediction/wjaccard.py index b8ef33d926f..fc6edae8d3e 100644 --- a/python/cugraph/cugraph/link_prediction/wjaccard.py +++ b/python/cugraph/cugraph/link_prediction/wjaccard.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -17,7 +17,7 @@ from cugraph.utilities import renumber_vertex_pair -def jaccard_w(input_graph, weights, vertex_pair=None): +def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -29,6 +29,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- input_graph : cugraph.Graph @@ -51,6 +55,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None): vertices. If provided, the jaccard coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. 
+ Returns ------- df : cudf.DataFrame @@ -87,6 +95,22 @@ def jaccard_w(input_graph, weights, vertex_pair=None): >>> df = cugraph.jaccard_w(G, weights) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if type(input_graph) is not Graph: raise TypeError("input graph must a Graph") diff --git a/python/cugraph/cugraph/link_prediction/woverlap.py b/python/cugraph/cugraph/link_prediction/woverlap.py index c7d4f56a428..27fb7d608ca 100644 --- a/python/cugraph/cugraph/link_prediction/woverlap.py +++ b/python/cugraph/cugraph/link_prediction/woverlap.py @@ -16,7 +16,7 @@ from cugraph.utilities import renumber_vertex_pair -def overlap_w(input_graph, weights, vertex_pair=None): +def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by @@ -28,6 +28,10 @@ def overlap_w(input_graph, weights, vertex_pair=None): neighbors. If first is specified but second is not, or vice versa, an exception will be thrown. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. + Parameters ---------- input_graph : cugraph.Graph @@ -51,6 +55,10 @@ def overlap_w(input_graph, weights, vertex_pair=None): vertices. If provided, the overlap coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. 
+ do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -88,6 +96,21 @@ def overlap_w(input_graph, weights, vertex_pair=None): ... len(weights['vertex']))] >>> df = cugraph.overlap_w(G, weights) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") if type(vertex_pair) == cudf.DataFrame: vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) diff --git a/python/cugraph/cugraph/link_prediction/wsorensen.py b/python/cugraph/cugraph/link_prediction/wsorensen.py index c017463a294..c27e4f66a02 100644 --- a/python/cugraph/cugraph/link_prediction/wsorensen.py +++ b/python/cugraph/cugraph/link_prediction/wsorensen.py @@ -17,13 +17,17 @@ from cugraph.utilities import renumber_vertex_pair -def sorensen_w(input_graph, weights, vertex_pair=None): +def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True): """ Compute the weighted Sorensen similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the volume of their intersection divided by the volume of each set. + NOTE: This algorithm doesn't currently support datasets with vertices that + are not (re)numbered vertices from 0 to V-1 where V is the total number of + vertices as this creates isolated vertices. 
+ Parameters ---------- input_graph : cugraph.Graph @@ -47,6 +51,10 @@ def sorensen_w(input_graph, weights, vertex_pair=None): vertices. If provided, the sorensen coefficient is computed for the given vertex pairs, else, it is computed for all vertex pairs. + do_expensive_check: bool (default=True) + When set to True, check if the vertices in the graph are (re)numbered + from 0 to V-1 where V is the total number of vertices. + Returns ------- df : cudf.DataFrame @@ -85,6 +93,22 @@ def sorensen_w(input_graph, weights, vertex_pair=None): >>> df = cugraph.sorensen_w(G, weights) """ + if do_expensive_check: + if not input_graph.renumbered: + input_df = input_graph.edgelist.edgelist_df[["src", "dst"]] + max_vertex = input_df.max().max() + expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype( + input_df.dtypes[0] + ) + nodes = ( + cudf.concat([input_df["src"], input_df["dst"]]) + .unique() + .sort_values() + .reset_index(drop=True) + ) + if not expected_nodes.equals(nodes): + raise ValueError("Unrenumbered vertices are not supported.") + if type(input_graph) is not Graph: raise TypeError("input graph must a Graph") diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index b04c4c741b1..43077126827 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -202,6 +202,7 @@ def test_nx_jaccard_time(read_csv, gpubenchmark): @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") def test_jaccard_edgevals(gpubenchmark, graph_file): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) @@ -326,3 +327,14 @@ def test_weighted_exp_jaccard(): use_weight = True with pytest.raises(ValueError): exp_jaccard(G, use_weight=use_weight) + + +@pytest.mark.sg +def test_invalid_datasets_jaccard(): + 
karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.jaccard(G) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 68f879dacdb..03bee451f3c 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -225,3 +225,14 @@ def test_weighted_exp_overlap(): use_weight = True with pytest.raises(ValueError): exp_overlap(G, use_weight=use_weight) + + +@pytest.mark.sg +def test_invalid_datasets_overlap(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.overlap(G) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 3457627ed7d..14d84784161 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -187,6 +187,7 @@ def test_nx_sorensen_time(gpubenchmark, read_csv): @pytest.mark.sg @pytest.mark.parametrize("graph_file", [netscience]) +@pytest.mark.skip(reason="Skipping because this datasets is unrenumbered") def test_sorensen_edgevals(gpubenchmark, graph_file): dataset_path = netscience.get_path() M = utils.read_csv_for_nx(dataset_path) @@ -288,3 +289,14 @@ def test_weighted_exp_sorensen(): use_weight = True with pytest.raises(ValueError): exp_sorensen(G, use_weight=use_weight) + + +@pytest.mark.sg +def test_invalid_datasets_sorensen(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", 
destination="dst") + with pytest.raises(ValueError): + cugraph.sorensen(G) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py index 22ace93c0e4..2bc39b877ea 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wjaccard.py @@ -176,3 +176,14 @@ def test_wjaccard_multi_column(read_csv): actual = df_res.sort_values("0_first").reset_index() expected = df_exp.sort_values("first").reset_index() assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) + + +@pytest.mark.sg +def test_invalid_datasets_jaccard_w(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.jaccard_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py index f4fab9d0faa..5e35bb66f07 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_woverlap.py @@ -159,3 +159,14 @@ def test_woverlap_multi_column(graph_file): actual = df_res.sort_values("0_first").reset_index() expected = df_exp.sort_values("first").reset_index() assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) + + +@pytest.mark.sg +def test_invalid_datasets_overlap_w(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.overlap_w(G, None) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py index 0cf775d666c..cca2363d2d6 100644 --- 
a/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_wsorensen.py @@ -180,3 +180,14 @@ def test_wsorensen_multi_column(read_csv): actual = df_res.sort_values("0_first").reset_index() expected = df_exp.sort_values("first").reset_index() assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) + + +@pytest.mark.sg +def test_invalid_datasets_sorensen_w(): + karate = DATASETS_UNDIRECTED[0] + df = karate.get_edgelist() + df = df.add(1) + G = cugraph.Graph(directed=False) + G.from_cudf_edgelist(df, source="src", destination="dst") + with pytest.raises(ValueError): + cugraph.sorensen_w(G, None)