
adjust file and folder hierarchy
shengzeang committed Apr 26, 2022
1 parent 7f63dc3 commit fc7db52
Showing 76 changed files with 880 additions and 757 deletions.
40 changes: 21 additions & 19 deletions README.md
@@ -1,33 +1,35 @@
## SGL: Scalable Graph Learning

**SGL** is a Graph Neural Network (GNN) toolkit targeting scalable graph learning, which supports deep graph learning on extremely large datasets. SGL allows users to easily implement scalable graph neural networks and evaluate their performance on various downstream tasks like node classification, node clustering, and link prediction. Further, SGL supports auto neural architecture search functionality based on <a href="https://github.com/PKU-DAIR/open-box" target="_blank" rel="nofollow">OpenBox</a>. SGL is designed and developed by the graph learning team from the <a href="https://cuibinpku.github.io/index.html" target="_blank" rel="nofollow">DAIR Lab</a> at Peking University.

## Library Highlights

+ **High scalability**: Following the scalable design paradigm **SGAP** in <a href="https://arxiv.org/abs/2203.00638" target="_blank" rel="nofollow">PaSca</a>, SGL scales to graph data with billions of nodes and edges (see the conceptual sketch after this list).
+ **Auto neural architecture search**: Automatically choose decent neural architectures according to specific tasks and pre-defined objectives (e.g., inference time).
+ **Ease of use**: User-friendly interfaces for implementing existing scalable GNNs and executing various downstream tasks.
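
The core idea behind **SGAP** is to decouple graph propagation from model training: neighborhood aggregation is precomputed once as sparse matrix products, so the trainable model only ever sees fixed per-node feature vectors and can be trained with ordinary mini-batch optimization. The snippet below is a minimal conceptual sketch of that two-stage flow in plain PyTorch; it is not SGL code, and the propagation depth, model, and hyperparameters are illustrative assumptions only.

```python
import torch

# Conceptual illustration of the SGAP (precompute-then-train) paradigm.
# This is NOT SGL's implementation; names and hyperparameters are illustrative.

def precompute(adj_norm: torch.Tensor, x: torch.Tensor, prop_steps: int = 3) -> torch.Tensor:
    """Stage 1 (pre-processing): k rounds of sparse neighborhood propagation,
    executed once and involving no learnable parameters."""
    h = x
    for _ in range(prop_steps):
        h = torch.sparse.mm(adj_norm, h)  # only sparse mat-muls; easy to scale out
    return h  # fixed, graph-free feature matrix


def train_head(feats, labels, train_idx, num_classes, epochs=100, lr=0.1):
    """Stage 2 (model training): the graph is no longer needed, so any node-wise
    model (here a linear head, as in SGC) can be trained with plain Adam/SGD."""
    model = torch.nn.Linear(feats.size(1), num_classes)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-5)
    for _ in range(epochs):
        opt.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(feats[train_idx]), labels[train_idx])
        loss.backward()
        opt.step()
    return model
```

Because the first stage has no learnable parameters, it only needs to run once and can be carried out with out-of-core or distributed sparse operations, which is what lets this family of methods reach graphs with billions of edges.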

## Installation (TODO)

#### Install from pip


#### Install from source




## Quick Start (TODO)

A quick start example is given by:

```python
from SGL.dataset import Planetoid
from SGL.models.homo import SGC
from SGL.tasks import NodeClassification

dataset = Planetoid("pubmed", "./", "official")
model = SGC(prop_steps=3, feat_dim=dataset.num_features, num_classes=dataset.num_classes)

device = "cuda:0"
test_acc = NodeClassification(dataset, model, lr=0.1, weight_decay=5e-5, epochs=200, device=device).test_acc
print(test_acc)
```

An example of the auto neural architecture search functionality is as follows (still a TODO placeholder in this commit):

```python

```
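
The block above is left empty because the search interface is still under construction. As a stopgap illustration of what searching over SGL models amounts to, the sketch below runs a naive grid search using only the classes already shown in the quick start; it is not the OpenBox-based search API, and a proper search would select configurations on a validation metric rather than the test accuracy exposed here.

```python
# Illustrative sketch only: an exhaustive grid search over two knobs of the
# quick-start pipeline. SGL's actual auto-search is driven by OpenBox and is
# not shown in this commit; only quick-start classes and arguments are used.
from SGL.dataset import Planetoid
from SGL.models.homo import SGC
from SGL.tasks import NodeClassification

dataset = Planetoid("pubmed", "./", "official")
device = "cuda:0"

best_acc, best_cfg = 0.0, None
for prop_steps in (2, 3, 4):      # propagation depth used by the precompute stage
    for lr in (0.05, 0.1, 0.2):   # learning rate of the downstream trainer
        model = SGC(prop_steps=prop_steps, feat_dim=dataset.num_features,
                    num_classes=dataset.num_classes)
        acc = NodeClassification(dataset, model, lr=lr, weight_decay=5e-5,
                                 epochs=200, device=device).test_acc
        if acc > best_acc:
            best_acc, best_cfg = acc, {"prop_steps": prop_steps, "lr": lr}

print(best_acc, best_cfg)
```

In the intended functionality, OpenBox replaces this exhaustive grid with black-box optimization over the architecture and training hyperparameters, guided by the pre-defined objectives mentioned in the highlights (e.g., inference time).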


## Related Publications

**PaSca: a Graph Neural Architecture Search System under the Scalable Paradigm.** Wentao Zhang, Yu Shen, Zheyu Lin, Yang Li, Xiaosen Li, Wen Ouyang, Yangyu Tao, Zhi Yang, and Bin Cui; The World Wide Web Conference (WWW 2022, CCF-A). https://arxiv.org/abs/2203.00638

## License

5 changes: 5 additions & 0 deletions __init__.py
@@ -0,0 +1,5 @@
__version__ = "0.0.1"

__all__ = [
"__version__",
]
2 changes: 1 addition & 1 deletion data/base_data.py
@@ -1,7 +1,7 @@
import numpy as np
import torch
from scipy.sparse import csr_matrix
from torch import Tensor
import numpy as np


# Base class for adjacency matrix
10 changes: 5 additions & 5 deletions data/base_dataset.py
@@ -1,9 +1,9 @@
import os
import os.path as osp
import itertools
import warnings
import numpy as np
import os
import os.path as osp
import torch
import warnings
from scipy.sparse import csr_matrix

from data.base_data import Node, Edge
@@ -248,7 +248,7 @@ def sample_by_edge_type(self, edge_types, undirected=True):
pre_sampled_node_types = []
for edge_type in edge_types:
pre_sampled_node_types = pre_sampled_node_types + \
[edge_type.split('__')[0], edge_type.split('__')[2]]
[edge_type.split('__')[0], edge_type.split('__')[2]]
pre_sampled_node_types = list(set(pre_sampled_node_types))

sampled_node_types = []
@@ -302,7 +302,7 @@ def sample_by_edge_type(self, edge_types, undirected=True):

edge_weight = torch.ones(len(rows))
adj = csr_matrix((edge_weight.numpy(), (rows.numpy(),
cols.numpy())), shape=(num_node, num_node))
cols.numpy())), shape=(num_node, num_node))

# remove previously existed undirected edges
adj.data = torch.ones(len(adj.data)).numpy()
2 changes: 1 addition & 1 deletion data/utils.py
@@ -21,4 +21,4 @@ def to_undirected(edge_index):
new_col = torch.hstack((col, row))
new_edge_index = torch.stack((new_row, new_col), dim=0)

return new_edge_index
return new_edge_index
43 changes: 43 additions & 0 deletions dataset/__init__.py
@@ -0,0 +1,43 @@
from .acm import Acm
from .actor import Actor
from .airports import Airports
from .amazon import Amazon
from .amazon_product import AmazonProduct
from .coauthor import Coauthor
from .dblp import Dblp
from .facebook import Facebook
from .flickr import Flickr
from .github import Github
from .karateclub import KarateClub
from .linkx_dataset import LINKXDataset
from .nell import Nell
from .ogbn import Ogbn
from .ogbn_mag import OgbnMag
from .planetoid import Planetoid
from .reddit import Reddit
from .twitch import Twitch
from .webkb import WebKB
from .wikics import Wikics

__all__ = [
"Acm",
"Actor",
"Airports",
"AmazonProduct",
"Amazon",
"Coauthor",
"Dblp",
"Facebook",
"Flickr",
"Github",
"KarateClub",
"LINKXDataset",
"Nell",
"OgbnMag",
"Ogbn",
"Planetoid",
"Reddit",
"Twitch",
"WebKB",
"Wikics",
]
15 changes: 7 additions & 8 deletions dataset/acm.py
@@ -1,16 +1,15 @@
import os.path as osp
import pickle as pkl
import torch
from torch_geometric.datasets import HGBDataset
from typing import Tuple

from data.base_data import HeteroGraph
from data.base_dataset import HeteroNodeDataset
from dataset.utils import pkl_read_file
from torch_geometric.datasets import HGBDataset


class Acm(HeteroNodeDataset):

NODE_TYPES = [
'paper',
'author',
@@ -47,7 +46,7 @@ def __init__(self, root="./", split="official"):
def edge_type_tuple_to_str(self, edge_type_tuple: Tuple) -> str:
if len(edge_type_tuple) != 3:
raise ValueError('number of elements is invalid for input tuple')
return edge_type_tuple[0]+self.EDGE_TYPE_DELIMITER+edge_type_tuple[2]
return edge_type_tuple[0] + self.EDGE_TYPE_DELIMITER + edge_type_tuple[2]

@property
def raw_file_paths(self):
@@ -86,12 +85,12 @@ def _process(self):
edge_weight_dict = {}
for edge_type_tuple in self.EDGE_TYPES_TUPLE:
edge_type = self.edge_type_tuple_to_str(edge_type_tuple)

row_dict[edge_type] = self.src_dataset[edge_type_tuple]['edge_index'][0] \
+ previous_node_cnt_dict[edge_type_tuple[0]]
+ previous_node_cnt_dict[edge_type_tuple[0]]

col_dict[edge_type] = self.src_dataset[edge_type_tuple]['edge_index'][1] \
+ previous_node_cnt_dict[edge_type_tuple[2]]
+ previous_node_cnt_dict[edge_type_tuple[2]]

edge_weight_dict[edge_type] = torch.ones(
self.src_dataset[edge_type_tuple]['edge_index'].size(1))
@@ -110,7 +109,7 @@ def _process(self):
num_cur_node_type = num_node_dict[node_type]
node_id_dict[node_type] = [i for i in range(
accumulated_node_cnt,
accumulated_node_cnt+num_cur_node_type)]
accumulated_node_cnt + num_cur_node_type)]
accumulated_node_cnt += num_cur_node_type

# obtain x_dict
@@ -130,7 +129,7 @@ def _process(self):
if 'x' in self.src_dataset[node_type]:
cur_x_len = self.src_dataset[node_type]['x'].size(1)
padded_tensor[:, accumulated_feature_dim:accumulated_feature_dim +
cur_x_len] = self.src_dataset[node_type]['x']
cur_x_len] = self.src_dataset[node_type]['x']
accumulated_feature_dim += cur_x_len
x_dict[node_type] = padded_tensor.numpy()

19 changes: 9 additions & 10 deletions dataset/actor.py
@@ -1,7 +1,6 @@
import numpy as np
import os.path as osp
import pickle as pkl

import numpy as np
import torch
from torch_sparse import SparseTensor, coalesce

@@ -23,17 +22,17 @@ def __init__(self, name="actor", root="./", split="official", split_id=0):
self._split, self._split_id = split, split_id
self._train_idx, self._val_idx, self._test_idx = self.__generate_split(
split)

@property
def raw_file_paths(self):
filenames = ['out1_node_feature_label.txt', 'out1_graph_edges.txt'
] + [f'film_split_0.6_0.2_{i}.npz' for i in range(10)]
] + [f'film_split_0.6_0.2_{i}.npz' for i in range(10)]
return [osp.join(self._raw_dir, filename) for filename in filenames]

@property
def processed_file_paths(self):
return osp.join(self._processed_dir, f"{self._name}.graph")

def _download(self):
url = 'https://raw.githubusercontent.com/graphdml-uiuc-jlu/geom-gcn/master'

@@ -42,13 +41,13 @@ def _download(self):
file_url = f'{url}/new_data/film/{raw_file_name}'
print(file_url)
download_to(file_url, raw_file_path)

for raw_file_path in self.raw_file_paths[2:]:
raw_file_name = osp.basename(raw_file_path)
file_url = f'{url}/splits/{raw_file_name}'
print(file_url)
download_to(file_url, raw_file_path)

def _process(self):
with open(self.raw_file_paths[0], 'r') as f:
data = [x.split('\t') for x in f.read().split('\n')[1:-1]]
@@ -74,21 +73,21 @@ def _process(self):
data = [[int(v) for v in r.split('\t')] for r in data]
edge_index = torch.tensor(data, dtype=torch.long).t().contiguous()
edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

row, col = edge_index[0], edge_index[1]
edge_weight = torch.ones(len(row))
edge_type = "actor__to__actor"

g = Graph(row, col, edge_weight, num_node,
node_type, edge_type, x=features, y=labels)

with open(self.processed_file_paths, 'wb') as rf:
try:
pkl.dump(g, rf)
except IOError as e:
print(e)
exit(1)

def __generate_split(self, split):
if split == "official":
train_masks, val_masks, test_masks = [], [], []
3 changes: 1 addition & 2 deletions dataset/airports.py
@@ -1,7 +1,6 @@
import numpy as np
import os.path as osp
import pickle as pkl

import numpy as np
import torch

from data.base_data import Graph
1 change: 0 additions & 1 deletion dataset/amazon.py
@@ -1,6 +1,5 @@
import os.path as osp
import pickle as pkl

import torch

from data.base_data import Graph
10 changes: 5 additions & 5 deletions dataset/amazon_product.py
@@ -1,16 +1,16 @@
# Haven't tested yet
import os.path as osp
import pickle as pkl
import json

import numpy as np
import os.path as osp
import pickle as pkl
import scipy.sparse as sp
import torch

from data.base_data import Graph
from data.base_dataset import NodeDataset
from dataset.utils import pkl_read_file, download_to


class AmazonProduct(NodeDataset):
def __init__(self, name="amazonproduct", root="./", split="official"):
super(AmazonProduct, self).__init__(root + "AmazonProduct", name)
@@ -19,12 +19,12 @@ def __init__(self, name="amazonproduct", root="./", split="official"):
self._split = split
self._train_idx, self._val_idx, self._test_idx = self.__generate_split(
split)

@property
def raw_file_paths(self):
filenames = ['adj_full.npz', 'feats.npy', 'class_map.json', 'role.json']
return [osp.join(self._raw_dir, filename) for filename in filenames]

@property
def processed_file_paths(self):
return osp.join(self._processed_dir, f"{self._name}.graph")
12 changes: 6 additions & 6 deletions dataset/choose_edge_type.py
@@ -1,7 +1,7 @@
from typing import List, Tuple
import random
import math
import random
import warnings
from typing import List, Tuple

EDGE_TYPE_DELIMITER = '__to__'

Expand All @@ -18,7 +18,7 @@ def RemoveDuplicateEdgeType(edge_types: List) -> List[str]:
unique_edge_types = []
for et in edge_types:
et_tuple = EdgeTypeStr2Tuple(et)
reversed_et = et_tuple[1]+EDGE_TYPE_DELIMITER+et_tuple[0]
reversed_et = et_tuple[1] + EDGE_TYPE_DELIMITER + et_tuple[0]
if reversed_et not in unique_edge_types:
unique_edge_types.append(et)
return unique_edge_types
@@ -63,7 +63,7 @@ def Combination(n: int, k: int) -> int:
raise ValueError('n < 0 or k < 0!')
result = 1
for i in range(k):
result = result*(n-i)//(i+1)
result = result * (n - i) // (i + 1)
return result


@@ -75,8 +75,8 @@ def ChooseMultiSubgraphs(subgraph_num: int, edge_type_num: int,
return subgraph_edge_types_list

# Estimate by "coupon collector"
maximal_reasonable_steps = 10 * Combination(len(unique_edge_type), edge_type_num) *\
(math.log2(Combination(len(unique_edge_type), edge_type_num))+1)
maximal_reasonable_steps = 10 * Combination(len(unique_edge_type), edge_type_num) * \
(math.log2(Combination(len(unique_edge_type), edge_type_num)) + 1)
step_cnt = 0

for _ in range(subgraph_num):