I’m currently engaged in a project that involves the construction of a Graph Neural Network (GNN) model using PyTorch Geometric. The primary objective of this project is to predict train delays in the French railway network through a multistep process.
The data for this project is represented as a heterogeneous graph, where each train is connected to its preceding and succeeding stations, referred to as Remarkable Points (PRs – Points Remarquables). The aim is to predict the delay attribute of the edges that connect a train to its subsequent PRs.
In addition to the train-PR connections, the stations themselves are interconnected to represent the overall network structure. Here is what one graph looks like:
[Figure: one example graph of the railway network (image not included).]
In the graphical representation of this network, trains are depicted as blue nodes, while the relationships between stations are represented by blue edges.
My data is represented as a HeteroData object, which is a heterogeneous graph with different types of nodes and edges. Here’s an example of my data:
HeteroData(
train={
x=[391, 8],
geometry=[391, 2],
},
pr={ geometry=[3076, 2] },
(train, prev_pr, pr)={
edge_index=[2, 2034],
edge_attr=[2034, 2],
},
(train, foll_pr, pr)={
edge_index=[2, 5871],
edge_attr=[5871, 1],
y=[5871],
},
(pr, pr_pr, pr)={
edge_index=[2, 3716],
edge_attr=[3716, 2],
}
)
I have already written code to create a dataset object for my data. Now, I’m trying to build a GNN model that can handle this type of data, but I’m not sure how to proceed. Specifically, I’m not sure how to define the architecture of the model that can predict the delay attribute for each edge, how to handle the different types of nodes and edges, and how to train the model on my data.
Here’s how I created the dataset object:
class RailwayGraphDataset(InMemoryDataset):
    """In-memory dataset of heterogeneous railway graphs.

    Every ``.pickle`` file found in ``raw_dir`` is unpickled into a graph
    object exposing ``nodes(data=True)`` and ``edges(data=True)`` and is
    converted into one ``HeteroData`` with:

    * node types ``train`` (``x``, ``geometry``) and ``pr`` (``geometry``);
    * edge types ``(train, prev_pr, pr)``, ``(train, foll_pr, pr)`` and
      ``(pr, pr_pr, pr)``, each with ``edge_index`` and ``edge_attr``;
    * the regression target ``y`` on ``(train, foll_pr, pr)``.

    Edge attributes are read as scalars (one ``float`` per attribute).
    """

    # Width of the ``train_type`` vector stored in ``train.x``.
    TRAIN_FEATURE_DIM = 8
    # Width of the ``geometry`` vector stored on train and pr nodes.
    GEOMETRY_DIM = 2

    def __init__(self, root, transform=None, pre_transform=None,
                 pre_filter=None):
        """Process the raw files if needed, then load the collated graphs.

        Args:
            root: Dataset root directory (holds the raw/processed folders).
            transform: Optional transform applied on every access.
            pre_transform: Optional transform applied once in ``process``.
            pre_filter: Optional predicate applied once in ``process``;
                graphs for which it returns a falsy value are dropped.
        """
        # ``pre_filter`` is forwarded so that the ``self.pre_filter`` branch
        # in ``process`` can actually be enabled by the caller.
        super().__init__(root, transform, pre_transform, pre_filter)
        # NOTE(review): recent PyTorch releases may default ``torch.load`` to
        # ``weights_only=True``, which can reject pickled graph objects --
        # confirm against the installed versions.
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Return every file name in ``raw_dir``, sorted for a stable order."""
        # ``os.listdir`` returns entries in arbitrary order; sorting keeps the
        # order of graphs in the processed dataset reproducible.
        return sorted(os.listdir(self.raw_dir))

    @property
    def processed_file_names(self):
        """Return the single file that stores the collated dataset."""
        return ['data.pt']

    def download(self):
        """Do nothing: raw files are expected to be placed in ``raw_dir``."""
        pass

    def process(self):
        """Convert every raw ``.pickle`` graph and save the collated result."""
        data_list = []
        for raw_path in self.raw_paths:
            # Anything that is not a pickled graph is ignored.
            if os.path.splitext(raw_path)[1] != '.pickle':
                continue
            # SECURITY: unpickling executes arbitrary code; only place
            # trusted files in the raw directory.
            with open(raw_path, 'rb') as f:
                graph = pickle.load(f)
            data_list.append(self._graph_to_hetero_data(graph))

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

    def _graph_to_hetero_data(self, graph):
        """Build one ``HeteroData`` object from one unpickled graph."""
        # Split nodes by type, keeping the graph's iteration order so that
        # each node gets a contiguous, type-local index.
        train_nodes = []
        pr_nodes = []
        for node, attrs in graph.nodes(data=True):
            if attrs['type'] == 'train':
                train_nodes.append((node, attrs))
            elif attrs['type'] == 'pr':
                pr_nodes.append((node, attrs))
        train_index = {node: i for i, (node, _) in enumerate(train_nodes)}
        pr_index = {node: i for i, (node, _) in enumerate(pr_nodes)}

        # Node feature matrices (zero-initialised, then filled row by row).
        train_x = torch.zeros(
            [len(train_nodes), self.TRAIN_FEATURE_DIM], dtype=torch.float)
        train_geometry = torch.zeros(
            [len(train_nodes), self.GEOMETRY_DIM], dtype=torch.float)
        for row, (_, attrs) in enumerate(train_nodes):
            train_x[row] = torch.tensor(
                attrs['train_type'], dtype=torch.float)
            train_geometry[row] = torch.tensor(
                attrs['geometry'], dtype=torch.float)

        pr_geometry = torch.zeros(
            [len(pr_nodes), self.GEOMETRY_DIM], dtype=torch.float)
        for row, (_, attrs) in enumerate(pr_nodes):
            pr_geometry[row] = torch.tensor(
                attrs['geometry'], dtype=torch.float)

        # Edge attribute rows keyed by (source index, target index). A later
        # edge with the same key overwrites the earlier one.
        prev_pr_edges = {}
        foll_pr_edges = {}
        pr_pr_edges = {}
        for u, v, attrs in graph.edges(data=True):
            edge_kind = attrs['type']
            if edge_kind == 'prev_pr':
                key = self._orient_train_pr_edge(u, v, train_index, pr_index)
                prev_pr_edges[key] = [
                    float(attrs['delay']),
                    float(attrs['theoric_arrival_time']),
                ]
            elif edge_kind == 'foll_pr':
                key = self._orient_train_pr_edge(u, v, train_index, pr_index)
                foll_pr_edges[key] = [float(attrs['delay'])]
            elif edge_kind == 'pr-pr':
                pr_pr_edges[(pr_index[u], pr_index[v])] = [
                    float(attrs['length_m']),
                    float(attrs['vol_oiseau']),
                ]

        hetero_data = HeteroData()
        hetero_data['train'].x = train_x
        hetero_data['train'].geometry = train_geometry
        hetero_data['pr'].geometry = pr_geometry

        self._store_edges(
            hetero_data['train', 'prev_pr', 'pr'], prev_pr_edges, 2)
        self._store_edges(
            hetero_data['train', 'foll_pr', 'pr'], foll_pr_edges, 1)
        self._store_edges(
            hetero_data['pr', 'pr_pr', 'pr'], pr_pr_edges, 2)

        # Labels follow the same edge order as ``edge_index``.
        # NOTE(review): ``y`` is the same ``delay`` value already stored in
        # this relation's ``edge_attr``; a model fed that ``edge_attr`` sees
        # its own target. Confirm this is intended before training.
        hetero_data['train', 'foll_pr', 'pr'].y = torch.tensor(
            [row[0] for row in foll_pr_edges.values()], dtype=torch.float)
        return hetero_data

    @staticmethod
    def _orient_train_pr_edge(u, v, train_index, pr_index):
        """Return ``(train_idx, pr_idx)`` for an edge given in either order.

        Raises:
            KeyError: If the endpoints are not one train and one pr node.
        """
        if u in train_index and v in pr_index:
            return train_index[u], pr_index[v]
        # The edge was stored as (pr, train): flip it.
        return train_index[v], pr_index[u]

    @staticmethod
    def _store_edges(store, edges, num_attrs):
        """Write ``edge_index`` / ``edge_attr`` for one relation.

        Args:
            store: The edge storage of the relation inside the HeteroData.
            edges: Dict mapping ``(src_idx, dst_idx)`` to a list of floats.
            num_attrs: Attribute width, used when ``edges`` is empty.
        """
        if edges:
            store.edge_index = torch.tensor(
                list(edges), dtype=torch.long).t().contiguous()
            store.edge_attr = torch.tensor(
                list(edges.values()), dtype=torch.float)
        else:
            # A relation without edges still gets well-shaped tensors
            # instead of failing while stacking an empty list.
            store.edge_index = torch.zeros([2, 0], dtype=torch.long)
            store.edge_attr = torch.zeros([0, num_attrs], dtype=torch.float)
Thank you very much for helping me!!
I tried to use PyTorch Geometric’s “to_hetero” function, so far without success, because I don’t really know how to define the model.
Boubacar Sow is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.