Some of the code below is borrowed from the PyTorch Geometric tutorials:
@inproceedings{Fey/Lenssen/2019,
  title={Fast Graph Representation Learning with {PyTorch Geometric}},
  author={Fey, Matthias and Lenssen, Jan E.},
  booktitle={ICLR Workshop on Representation Learning on Graphs and Manifolds},
  year={2019},
}

@inproceedings{Chiang_2019,
  title={Cluster-GCN: An Efficient Algorithm for Training Deep and Large Graph Convolutional Networks},
  author={Chiang, Wei-Lin and Liu, Xuanqing and Si, Si and Li, Yang and Bengio, Samy and Hsieh, Cho-Jui},
  booktitle={Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining},
  publisher={ACM},
  year={2019}
}
Neural networks learn by seeing samples from the training set and then adjusting their weights so that the actual output moves closer to the desired target values. There are three ways to present samples to the network: full-batch (the whole training set per weight update), mini-batch (a subset of samples per update), and stochastic (a single sample per update).
Suppose we have a dataset of size 100 × 2:
[image source: Introduction to Deep Learning by Nandita Bhaskhar]
[image source: Chiang et al. 2019]
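To make the three schemes concrete, here is a minimal NumPy sketch (not part of the original notebook; the 100 × 2 array and the batch size of 10 are illustrative choices) showing how many weight updates each scheme produces per epoch:

import numpy as np

X = np.random.rand(100, 2)  # 100 samples, 2 features

# Full-batch: one update per epoch using all 100 samples.
full_batches = [X]

# Mini-batch: batch size 10 -> 10 updates per epoch.
mini_batches = [X[i:i + 10] for i in range(0, len(X), 10)]

# Stochastic: one sample at a time -> 100 updates per epoch.
stochastic_batches = [X[i:i + 1] for i in range(len(X))]

print(len(full_batches), len(mini_batches), len(stochastic_batches))
# 1 10 100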
# install pytorch_geometric
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
If you’re using a CPU, check torch version:
!python -c "import torch; print(torch.__version__)"
then run the command to install the dependencies:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.1+cpu.html
If you’re using a GPU, check torch version:
!python -c "import torch; print(torch.__version__)"
and CUDA version:
!python -c "import torch; print(torch.version.cuda)"
then run the command to install the dependencies:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.1+cu121.html
!python -c "import torch; print(torch.__version__)"
2.2.1+cu121
!python -c "import torch; print(torch.version.cuda)"
12.1
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.1+cu121.html
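If you'd rather not copy the version strings by hand, a small sketch like the following (an illustration, not from the original notebook) builds the matching wheel index URL from the installed torch:

import torch

torch_version = torch.__version__.split('+')[0]  # e.g. '2.2.1'
cuda_version = torch.version.cuda                # e.g. '12.1', or None on CPU-only builds
suffix = f"cu{cuda_version.replace('.', '')}" if cuda_version else "cpu"
print(f"https://data.pyg.org/whl/torch-{torch_version}+{suffix}.html")
# -> https://data.pyg.org/whl/torch-2.2.1+cu121.html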
# Standard libraries
import numpy as np
from scipy import sparse
import seaborn as sns
import pandas as pd
# Plotting libraries
import matplotlib.pyplot as plt
import networkx as nx
from matplotlib import cm
from IPython.display import Javascript # Restrict height of output cell.
# PyTorch
import torch
import torch.nn.functional as F
from torch.nn import Linear
# import pyg_lib
import torch_sparse
# PyTorch geometric
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import ClusterData, ClusterLoader
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.data import Data
from torch_geometric import seed_everything
random_seed = 42
seed_everything(random_seed)  # Seeds random, numpy, and torch in one call.
plt.style.use('dark_background')
num_epochs = 101
dataset = Planetoid(root='data/Planetoid', name='PubMed', transform=NormalizeFeatures())
num_features = dataset.num_features
num_classes = dataset.num_classes
data = dataset[0] # Get the first graph object.
data
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!
Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])
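The Data object holds the entire graph: x is the [num_nodes, num_features] node-feature matrix, edge_index stores the edges as a [2, num_edges] tensor in COO format, y holds one class label per node, and the three boolean masks select the nodes belonging to the training, validation, and test splits.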
# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.3f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')
Number of nodes: 19717
Number of edges: 88648
Average node degree: 4.50
Number of training nodes: 60
Training node label rate: 0.003
Has isolated nodes: False
Has self-loops: False
Is undirected: True
class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
print(model)
GCN(
(conv1): GCNConv(500, 16)
(conv2): GCNConv(16, 3)
)
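For reference, each GCNConv layer implements the graph-convolution propagation rule of Kipf & Welling, which is the formulation PyG documents for this layer:

$$\mathbf{X}' = \mathbf{\hat{D}}^{-1/2} \mathbf{\hat{A}} \mathbf{\hat{D}}^{-1/2} \mathbf{X} \mathbf{\Theta},$$

where $\mathbf{\hat{A}} = \mathbf{A} + \mathbf{I}$ is the adjacency matrix with inserted self-loops, $\mathbf{\hat{D}}$ is its diagonal degree matrix, and $\mathbf{\Theta}$ is the learnable weight matrix. Since the model stacks two such layers, each node's prediction depends on its 2-hop neighborhood.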
def train(data, mask):
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x, data.edge_index)  # Perform a single forward pass.
    loss = criterion(out[mask], data.y[mask])  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss
@torch.no_grad()  # No gradients needed for evaluation.
def test(data, mask):
    model.eval()
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    correct = pred[mask] == data.y[mask]  # Check against ground-truth labels.
    acc = int(correct.sum()) / int(mask.sum())  # Ratio of correct predictions.
    return acc
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))
for epoch in range(1, num_epochs):
    loss = train(data, data.train_mask)
    if epoch % 10 == 0:
        train_acc = test(data, data.train_mask)
        print(f'Epoch: {epoch:03d}, Train: {train_acc:.4f}')
Epoch: 010, Train: 0.9167
Epoch: 020, Train: 0.9333
Epoch: 030, Train: 0.9333
Epoch: 040, Train: 0.9500
Epoch: 050, Train: 0.9500
Epoch: 060, Train: 0.9667
Epoch: 070, Train: 0.9667
Epoch: 080, Train: 0.9833
Epoch: 090, Train: 1.0000
Epoch: 100, Train: 1.0000
test_acc = test(data, data.test_mask)
test_acc
0.766
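The model above sees the entire graph in every forward pass, which becomes a memory bottleneck on large graphs. Cluster-GCN (Chiang et al. 2019) sidesteps this by partitioning the graph into subgraphs with METIS and training on mini-batches of clusters. Below, ClusterData splits PubMed into 128 partitions and ClusterLoader merges 32 of them per batch, giving 128 / 32 = 4 mini-batches per epoch.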
cluster_data = ClusterData(data, num_parts=128)
train_loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True)
total_num_nodes = 0
for step, sub_data in enumerate(train_loader):
    print(f'Batch: {step + 1} has {sub_data.num_nodes} nodes')
    print(sub_data)
    print()
    total_num_nodes += sub_data.num_nodes

print(f'Iterated over {total_num_nodes} of {data.num_nodes} nodes!')
Computing METIS partitioning...
Done!
Batch: 1 has 4961 nodes
Data(x=[4961, 500], y=[4961], train_mask=[4961], val_mask=[4961], test_mask=[4961], edge_index=[2, 17856])
Batch: 2 has 4912 nodes
Data(x=[4912, 500], y=[4912], train_mask=[4912], val_mask=[4912], test_mask=[4912], edge_index=[2, 17472])
Batch: 3 has 4915 nodes
Data(x=[4915, 500], y=[4915], train_mask=[4915], val_mask=[4915], test_mask=[4915], edge_index=[2, 16548])
Batch: 4 has 4929 nodes
Data(x=[4929, 500], y=[4929], train_mask=[4929], val_mask=[4929], test_mask=[4929], edge_index=[2, 14892])
Iterated over 19717 of 19717 nodes!
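Note that the four batches together contain 17856 + 17472 + 16548 + 14892 = 66768 edges, noticeably fewer than the 88648 edges of the full graph: edges connecting nodes that land in different mini-batches are dropped, which is the approximation Cluster-GCN accepts in exchange for scalability.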
def train_batch(loader):
    model.train()
    total_loss = 0
    for sub_data in loader:  # Iterate over each mini-batch.
        out = model(sub_data.x, sub_data.edge_index)  # Forward pass on the subgraph.
        loss = criterion(out[sub_data.train_mask], sub_data.y[sub_data.train_mask])  # Loss on training nodes only.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.
        total_loss += loss.item()
    return total_loss / len(loader)  # Mean loss over mini-batches.
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))
for epoch in range(1, num_epochs):
    loss = train_batch(train_loader)
    if epoch % 10 == 0:
        train_acc = test(data, data.train_mask)
        print(f'Epoch: {epoch:03d}, Train: {train_acc:.4f}')
Epoch: 010, Train: 0.9333
Epoch: 020, Train: 0.9833
Epoch: 030, Train: 1.0000
Epoch: 040, Train: 1.0000
Epoch: 050, Train: 1.0000
Epoch: 060, Train: 1.0000
Epoch: 070, Train: 1.0000
Epoch: 080, Train: 1.0000
Epoch: 090, Train: 1.0000
Epoch: 100, Train: 1.0000
test_acc = test(data, data.test_mask)
test_acc
0.773
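In this run, cluster-based mini-batch training reaches a test accuracy of 0.773 versus 0.766 for full-batch training, so the partition approximation costs nothing here. On a graph as small as PubMed the two approaches are comparable; the real payoff of Cluster-GCN is that each update only materializes a fraction of the graph, which is what makes it viable at scale.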