Running following code if device = 'cpu'
then loss decreases as expected. However, if device = 'cuda'
the loss wont decrease, and model training fails. How should I correct this?
device = 'cpu'
device = 'cuda'
import sys
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.nn import Node2Vec
class Node2VecV2(Node2Vec):
def __init__(
self,
edge_index: Tensor,
embedding_dim: int,
walk_length: int,
context_size: int,
walks_per_node: int = 1,
p: float = 1.0,
q: float = 1.0,
num_negative_samples: int = 1,
num_nodes: Optional[int] = None,
sparse: bool = False,
batch_size: int = 128,
shuffle: bool = True,
num_workers: int = 0,
device: str = 'cuda'
):
super().__init__(edge_index, embedding_dim, walk_length, context_size, walks_per_node,
p, q, num_negative_samples, num_nodes, sparse)
self.device = device
self.batch_size = batch_size
self.shuffle = shuffle
self.num_workers = 0 if sys.platform.startswith('win') else num_workers
self.loader = self.loader(batch_size=128, shuffle=True,
num_workers=num_workers)
# def make_loader(
# self,
# batch_size: int = 128,
# shuffle: bool = True,
# num_workers: int = 0,
# device: str = 'cpu'
# ):
# self.device = device
# self.batch_size = batch_size
# self.shuffle = shuffle
# self.num_workers = 0 if sys.platform.startswith('win') else num_workers
# self.loader = super().loader(batch_size=128, shuffle=True,
# num_workers=num_workers)
def __repr__(self) -> str:
return (f'{self.__class__.__name__}({self.embedding.weight.size(0)}, '
f'{self.embedding.weight.size(1)})')
def train_(self):
self.train()
total_loss = 0
for pos_rw, neg_rw in self.loader:
self.optimizer.zero_grad()
loss = self.loss(pos_rw.to(device), neg_rw.to(device))
loss.backward()
self.optimizer.step()
total_loss += loss.item()
#print(total_loss / len(self.loader))
return total_loss / len(self.loader)
def train_dw_model(dw_model, dataset):
data = dataset[0]
data = data.to(device)
best_val = 0
for epoch in range(1, 101):
loss = dw_model.train_()
with torch.no_grad():
dw_model.eval()
z = dw_model()
val_acc = dw_model.test(
z[data.train_mask], data.y[data.train_mask],
z[data.val_mask], data.y[data.val_mask],
max_iter=150)
if val_acc> best_val:
best_val = val_acc
torch.save(dw_model.state_dict(), 'dw.pt')
print(f'{dataset}_{dw_model} Epoch: {epoch:02d}, Loss: {loss:.4f},'
f' Val: {val_acc*100:.2f} best Val: {best_val*100:.2f} ')
if __name__ == '__main__':
import os.path as osp
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'Planetoid')
dataset = Planetoid(path, 'Cora', transform=T.NormalizeFeatures())
data = dataset[0].to(device)
dw_model = Node2VecV2(data.edge_index, embedding_dim=128, walk_length=20,
context_size=10, walks_per_node=10,
num_negative_samples=1, p=1, q=1, sparse=True,batch_size=128, shuffle=True, num_workers=4, device=device).to(device)
#dw_model.make_loader(batch_size=128, shuffle=True, num_workers=4, device=args.device)
dw_model.optimizer = torch.optim.SparseAdam(list(dw_model.parameters()), lr=0.01)
for i in range(10):
print(f'Epoch {i} loss: ',dw_model.train_())
cpu output:
Epoch 0 loss: 8.111482880332254
Epoch 1 loss: 6.081473242152821
Epoch 2 loss: 4.976185473528775
Epoch 3 loss: 4.138110041618347
Epoch 4 loss: 3.4765207875858652
Epoch 5 loss: 2.960351337086071
Epoch 6 loss: 2.5505979494615034
Epoch 7 loss: 2.2174546501853247
Epoch 8 loss: 1.955638435753909
Epoch 9 loss: 1.7383252869952808
Process finished with exit code 0
cuda output:
Epoch 0 loss: 1.3862942511385137
Epoch 1 loss: 1.3862942511385137
Epoch 2 loss: 1.3862942511385137
Epoch 3 loss: 1.3862942511385137
Epoch 4 loss: 1.3862942511385137
Epoch 5 loss: 1.3862942511385137
Epoch 6 loss: 1.3862942511385137
Epoch 7 loss: 1.3862942511385137
Epoch 8 loss: 1.3862942511385137
Epoch 9 loss: 1.3862942511385137
Process finished with exit code 0