I trained a LayoutLMv3 document classification model using PyTorch Lightning on a few document classes. After training, I uploaded the model weights to the Hugging Face Hub.
ISSUE: My model1 was trained on 4 classes, and now I want to fine-tune model1 on other data that has different classes. I want to reuse the same weights so that model2 is able to classify documents from both datasets (dataset1 classes + dataset2 classes). I’m facing issues removing the last layers.
Could anybody please help me with the changes I need to make in order to fine-tune model1 on different data?
Could anybody please help me with the changes I need to make in order to fine-tune model1 on different data?
Thanks!
I tried the code below, but it looks like the model weights are newly initialized.
<code>model = LayoutLMv3ForSequenceClassification.from_pretrained(
num_labels=len(DOCUMENT_CLASSES),
ignore_mismatched_sizes=True,
# warning: Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at 314e/mymodel1 and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
<code>model = LayoutLMv3ForSequenceClassification.from_pretrained(
'user/mymodel1',
num_labels=len(DOCUMENT_CLASSES),
ignore_mismatched_sizes=True,
)
# warning: Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at 314e/mymodel1 and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
</code>
# Load the previously fine-tuned checkpoint, but size the classification head
# for the new label set. The warning printed for this call lists only
# classifier.out_proj.weight / classifier.out_proj.bias as re-initialized
# (their shapes differ: 4 labels in the checkpoint vs. 7 requested).
model = LayoutLMv3ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='user/mymodel1',
    ignore_mismatched_sizes=True,
    num_labels=len(DOCUMENT_CLASSES),
)
# warning: Some weights of LayoutLMv3ForSequenceClassification were not initialized from the model checkpoint at 314e/mymodel1 and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([7]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Below is the code snippet that I used to train model1.
<code>DOCUMENT_CLASSES = sorted(list(map(
Path(TRAIN_DS_PATH).glob("*")
train_images = list(TRAIN_DS_PATH.glob('*/*.jpg'))
test_images = list(TEST_DS_PATH.glob('*/*.jpg'))
val_images = list(VAL_DS_PATH.glob('*/*.jpg'))
def scale_bounding_box(box: List[int], width_scale : float = 1.0, height_scale : float = 1.0) -> List[int]:
int(box[0] * width_scale),
int(box[1] * height_scale),
int(box[2] * width_scale),
int(box[3] * height_scale)
class DocumentClassificationDataset(Dataset):
def __init__(self, image_paths, processor):
self.image_paths = image_paths
self.processor = processor
return len(self.image_paths)
def __getitem__(self, item):
image_path = self.image_paths[item]
json_path = image_path.with_suffix(".json")
with json_path.open("r") as f:
ocr_result = json.load(f)
with Image.open(image_path).convert("RGB") as image:
width, height = image.size
width_scale = 1000 / width
height_scale = 1000 / height
boxes.append(scale_bounding_box(
words.append(row["word"])
encoding = self.processor(
label = DOCUMENT_CLASSES.index(image_path.parent.name)
input_ids=encoding["input_ids"].flatten(),
attention_mask=encoding["attention_mask"].flatten(),
bbox=encoding["bbox"].flatten(end_dim=1),
pixel_values=encoding["pixel_values"].flatten(end_dim=1),
labels=torch.tensor(label, dtype=torch.long)
train_dataset = DocumentClassificationDataset(train_images, processor)
val_dataset = DocumentClassificationDataset(val_images, processor)
train_data_loader = DataLoader(
val_data_loader = DataLoader(
class ModelModule(pl.LightningModule):
def __init__(self, n_classes:int):
self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
"microsoft/layoutlmv3-base",
self.model.config.id2label = {k: v for k, v in enumerate(DOCUMENT_CLASSES)}
self.model.config.label2id = {v: k for k, v in enumerate(DOCUMENT_CLASSES)}
self.train_accuracy = Accuracy(task="multiclass", num_classes=n_classes)
self.val_accuracy = Accuracy(task="multiclass", num_classes=n_classes)
def forward(self, input_ids, attention_mask, bbox, pixel_values, labels=None):
attention_mask=attention_mask,
pixel_values=pixel_values,
def training_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
pixel_values = batch["pixel_values"]
output = self(input_ids, attention_mask, bbox, pixel_values, labels)
self.log("train_loss", output.loss)
self.train_accuracy(output.logits, labels),
def validation_step(self, batch, batch_idx):
input_ids = batch["input_ids"]
attention_mask = batch["attention_mask"]
pixel_values = batch["pixel_values"]
output = self(input_ids, attention_mask, bbox, pixel_values, labels)
self.log("val_loss", output.loss)
self.val_accuracy(output.logits, labels),
def configure_optimizers(self):
optimizer = torch.optim.Adam(self.model.parameters(), lr=0.00001) #1e-5
model_module = ModelModule(len(DOCUMENT_CLASSES))
from pytorch_lightning.callbacks import EarlyStopping
early_stopping = EarlyStopping(
monitor='val_loss', # Monitoring validation loss
min_delta=0.00, # Minimum change to qualify as an improvement
patience=5, # Number of epochs with no improvement after which training will be stopped
verbose=True, # Whether to print logs to stdout
mode='min' # `min` mode means training will stop when the quantity monitored has stopped decreasing
model_checkpoint = ModelCheckpoint(
filename="{epoch}-{step}-{val_loss:.4f}", save_last=True, save_top_k=3, monitor="val_loss", mode="min"
trainer.fit(model_module, train_data_loader, val_data_loader)
<code>DOCUMENT_CLASSES = sorted(list(map(
lambda p: p.name,
Path(TRAIN_DS_PATH).glob("*")
)))
train_images = list(TRAIN_DS_PATH.glob('*/*.jpg'))
test_images = list(TEST_DS_PATH.glob('*/*.jpg'))
val_images = list(VAL_DS_PATH.glob('*/*.jpg'))
def scale_bounding_box(box: List[int], width_scale : float = 1.0, height_scale : float = 1.0) -> List[int]:
    """Return a rescaled copy of a four-coordinate bounding box.

    Elements 0 and 2 of ``box`` are multiplied by ``width_scale`` and
    elements 1 and 3 by ``height_scale``; every product is then truncated
    with ``int()``. The input list is not modified.
    """
    x_a, y_a, x_b, y_b = box[0], box[1], box[2], box[3]
    scaled = (
        x_a * width_scale,
        y_a * height_scale,
        x_b * width_scale,
        y_b * height_scale,
    )
    return [int(value) for value in scaled]
class DocumentClassificationDataset(Dataset):
    """Map-style dataset producing one processor-encoded document per index.

    For every image path the dataset reads a sibling ``.json`` file (same
    stem) whose rows provide ``"word"`` and ``"bounding_box"`` entries, and
    derives the label from the name of the image's parent directory via the
    module-level ``DOCUMENT_CLASSES`` list.
    """

    def __init__(self, image_paths, processor):
        self.processor = processor
        self.image_paths = image_paths

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image_path = self.image_paths[item]

        # OCR output lives next to the image, differing only in extension.
        with image_path.with_suffix(".json").open("r") as ocr_file:
            ocr_result = json.load(ocr_file)

        with Image.open(image_path).convert("RGB") as image:
            width, height = image.size
            # Rescale pixel coordinates onto a 1000 x 1000 grid.
            x_factor = 1000 / width
            y_factor = 1000 / height

            boxes = [
                scale_bounding_box(row["bounding_box"], x_factor, y_factor)
                for row in ocr_result
            ]
            words = [row["word"] for row in ocr_result]

            encoding = self.processor(
                image,
                words,
                boxes=boxes,
                max_length=512,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

        # The class index is the position of the folder name in DOCUMENT_CLASSES.
        label = DOCUMENT_CLASSES.index(image_path.parent.name)

        # NOTE(review): the flatten calls presumably strip the leading batch
        # axis added by return_tensors="pt" -- confirm against the processor.
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "bbox": encoding["bbox"].flatten(end_dim=1),
            "pixel_values": encoding["pixel_values"].flatten(end_dim=1),
            "labels": torch.tensor(label, dtype=torch.long),
        }
# Both splits share the same processor instance.
train_dataset = DocumentClassificationDataset(train_images, processor)
val_dataset = DocumentClassificationDataset(val_images, processor)

# Only the training loader shuffles; validation keeps its order.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True, num_workers=4)
val_data_loader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False, num_workers=4)
class ModelModule(pl.LightningModule):
    """Lightning wrapper around ``LayoutLMv3ForSequenceClassification``.

    Args:
        n_classes: Number of labels for the classification head and for the
            train/validation accuracy metrics.
        pretrained_model_name: Checkpoint identifier handed to
            ``from_pretrained``. Defaults to the base checkpoint; pass an
            already fine-tuned checkpoint to continue training from it.
        ignore_mismatched_sizes: Forwarded to ``from_pretrained``. Set it to
            ``True`` when the checkpoint's head was trained with a different
            number of labels than ``n_classes``; the mismatched head weights
            are then newly initialized instead of raising an error.

    Note:
        ``id2label`` / ``label2id`` are built from the module-level
        ``DOCUMENT_CLASSES`` list, so ``n_classes`` should equal its length.
    """

    def __init__(
        self,
        n_classes: int,
        pretrained_model_name: str = "microsoft/layoutlmv3-base",
        ignore_mismatched_sizes: bool = False,
    ):
        super().__init__()
        self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
            pretrained_model_name,
            num_labels=n_classes,
            ignore_mismatched_sizes=ignore_mismatched_sizes,
        )
        # Store human-readable class names in the model config.
        self.model.config.id2label = {k: v for k, v in enumerate(DOCUMENT_CLASSES)}
        self.model.config.label2id = {v: k for k, v in enumerate(DOCUMENT_CLASSES)}
        self.train_accuracy = Accuracy(task="multiclass", num_classes=n_classes)
        self.val_accuracy = Accuracy(task="multiclass", num_classes=n_classes)

    def forward(self, input_ids, attention_mask, bbox, pixel_values, labels=None):
        """Delegate to the wrapped model and return its output object."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            bbox=bbox,
            pixel_values=pixel_values,
            labels=labels
        )

    def _run_batch(self, batch):
        """Run the model on one batch dict and return ``(output, labels)``."""
        labels = batch["labels"]
        output = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["bbox"],
            batch["pixel_values"],
            labels,
        )
        return output, labels

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_acc`` and return the batch loss."""
        output, labels = self._run_batch(batch)
        self.log("train_loss", output.loss)
        self.log(
            "train_acc",
            self.train_accuracy(output.logits, labels),
            on_step=True,
            on_epoch=True
        )
        return output.loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_acc`` (epoch level) and return the loss."""
        output, labels = self._run_batch(batch)
        self.log("val_loss", output.loss)
        self.log(
            "val_acc",
            self.val_accuracy(output.logits, labels),
            on_step=False,
            on_epoch=True
        )
        return output.loss

    def configure_optimizers(self):
        """Return an Adam optimizer over the wrapped model's parameters."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.00001) #1e-5
        return optimizer
# One output label per discovered document class.
model_module = ModelModule(n_classes=len(DOCUMENT_CLASSES))

from pytorch_lightning.callbacks import EarlyStopping

# Stop when val_loss has not improved (any decrease counts, min_delta=0)
# for 5 consecutive validation checks.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.00, patience=5, verbose=True, mode='min')

# Keep the three checkpoints with the lowest val_loss, plus the latest one.
model_checkpoint = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=3,
    save_last=True,
    filename="{epoch}-{step}-{val_loss:.4f}",
)

training_callbacks = [model_checkpoint, early_stopping]
trainer = pl.Trainer(accelerator="gpu", precision=16, max_epochs=10, callbacks=training_callbacks)

trainer.fit(model_module, train_data_loader, val_data_loader)
</code>
# Class names are the sub-directory names of the training split, sorted so
# that label indices are stable between runs. Plain files sitting next to the
# class folders (e.g. .DS_Store) are skipped: labels are later looked up from
# an image's parent directory name, so a non-directory entry could never be a
# real class and would only inflate the number of labels.
DOCUMENT_CLASSES = sorted(
    entry.name
    for entry in Path(TRAIN_DS_PATH).glob("*")
    if entry.is_dir()
)

# Images are expected one level down: <split>/<class name>/<file>.jpg
train_images = list(TRAIN_DS_PATH.glob('*/*.jpg'))
test_images = list(TEST_DS_PATH.glob('*/*.jpg'))
val_images = list(VAL_DS_PATH.glob('*/*.jpg'))
def scale_bounding_box(box: List[int], width_scale : float = 1.0, height_scale : float = 1.0) -> List[int]:
    """Return a rescaled copy of a four-coordinate bounding box.

    Elements 0 and 2 of ``box`` are multiplied by ``width_scale`` and
    elements 1 and 3 by ``height_scale``; every product is then truncated
    with ``int()``. The input list is not modified.
    """
    x_a, y_a, x_b, y_b = box[0], box[1], box[2], box[3]
    scaled = (
        x_a * width_scale,
        y_a * height_scale,
        x_b * width_scale,
        y_b * height_scale,
    )
    return [int(value) for value in scaled]
class DocumentClassificationDataset(Dataset):
    """Map-style dataset producing one processor-encoded document per index.

    For every image path the dataset reads a sibling ``.json`` file (same
    stem) whose rows provide ``"word"`` and ``"bounding_box"`` entries, and
    derives the label from the name of the image's parent directory via the
    module-level ``DOCUMENT_CLASSES`` list.
    """

    def __init__(self, image_paths, processor):
        self.processor = processor
        self.image_paths = image_paths

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image_path = self.image_paths[item]

        # OCR output lives next to the image, differing only in extension.
        with image_path.with_suffix(".json").open("r") as ocr_file:
            ocr_result = json.load(ocr_file)

        with Image.open(image_path).convert("RGB") as image:
            width, height = image.size
            # Rescale pixel coordinates onto a 1000 x 1000 grid.
            x_factor = 1000 / width
            y_factor = 1000 / height

            boxes = [
                scale_bounding_box(row["bounding_box"], x_factor, y_factor)
                for row in ocr_result
            ]
            words = [row["word"] for row in ocr_result]

            encoding = self.processor(
                image,
                words,
                boxes=boxes,
                max_length=512,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

        # The class index is the position of the folder name in DOCUMENT_CLASSES.
        label = DOCUMENT_CLASSES.index(image_path.parent.name)

        # NOTE(review): the flatten calls presumably strip the leading batch
        # axis added by return_tensors="pt" -- confirm against the processor.
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "bbox": encoding["bbox"].flatten(end_dim=1),
            "pixel_values": encoding["pixel_values"].flatten(end_dim=1),
            "labels": torch.tensor(label, dtype=torch.long),
        }
# Both splits share the same processor instance.
train_dataset = DocumentClassificationDataset(train_images, processor)
val_dataset = DocumentClassificationDataset(val_images, processor)

# Only the training loader shuffles; validation keeps its order.
train_data_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True, num_workers=4)
val_data_loader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False, num_workers=4)
class ModelModule(pl.LightningModule):
    """Lightning wrapper around ``LayoutLMv3ForSequenceClassification``.

    Args:
        n_classes: Number of labels for the classification head and for the
            train/validation accuracy metrics.
        pretrained_model_name: Checkpoint identifier handed to
            ``from_pretrained``. Defaults to the base checkpoint; pass an
            already fine-tuned checkpoint to continue training from it.
        ignore_mismatched_sizes: Forwarded to ``from_pretrained``. Set it to
            ``True`` when the checkpoint's head was trained with a different
            number of labels than ``n_classes``; the mismatched head weights
            are then newly initialized instead of raising an error.

    Note:
        ``id2label`` / ``label2id`` are built from the module-level
        ``DOCUMENT_CLASSES`` list, so ``n_classes`` should equal its length.
    """

    def __init__(
        self,
        n_classes: int,
        pretrained_model_name: str = "microsoft/layoutlmv3-base",
        ignore_mismatched_sizes: bool = False,
    ):
        super().__init__()
        self.model = LayoutLMv3ForSequenceClassification.from_pretrained(
            pretrained_model_name,
            num_labels=n_classes,
            ignore_mismatched_sizes=ignore_mismatched_sizes,
        )
        # Store human-readable class names in the model config.
        self.model.config.id2label = {k: v for k, v in enumerate(DOCUMENT_CLASSES)}
        self.model.config.label2id = {v: k for k, v in enumerate(DOCUMENT_CLASSES)}
        self.train_accuracy = Accuracy(task="multiclass", num_classes=n_classes)
        self.val_accuracy = Accuracy(task="multiclass", num_classes=n_classes)

    def forward(self, input_ids, attention_mask, bbox, pixel_values, labels=None):
        """Delegate to the wrapped model and return its output object."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            bbox=bbox,
            pixel_values=pixel_values,
            labels=labels
        )

    def _run_batch(self, batch):
        """Run the model on one batch dict and return ``(output, labels)``."""
        labels = batch["labels"]
        output = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["bbox"],
            batch["pixel_values"],
            labels,
        )
        return output, labels

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_acc`` and return the batch loss."""
        output, labels = self._run_batch(batch)
        self.log("train_loss", output.loss)
        self.log(
            "train_acc",
            self.train_accuracy(output.logits, labels),
            on_step=True,
            on_epoch=True
        )
        return output.loss

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_acc`` (epoch level) and return the loss."""
        output, labels = self._run_batch(batch)
        self.log("val_loss", output.loss)
        self.log(
            "val_acc",
            self.val_accuracy(output.logits, labels),
            on_step=False,
            on_epoch=True
        )
        return output.loss

    def configure_optimizers(self):
        """Return an Adam optimizer over the wrapped model's parameters."""
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.00001) #1e-5
        return optimizer
# One output label per discovered document class.
model_module = ModelModule(n_classes=len(DOCUMENT_CLASSES))

from pytorch_lightning.callbacks import EarlyStopping

# Stop when val_loss has not improved (any decrease counts, min_delta=0)
# for 5 consecutive validation checks.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.00, patience=5, verbose=True, mode='min')

# Keep the three checkpoints with the lowest val_loss, plus the latest one.
model_checkpoint = ModelCheckpoint(
    monitor="val_loss",
    mode="min",
    save_top_k=3,
    save_last=True,
    filename="{epoch}-{step}-{val_loss:.4f}",
)

training_callbacks = [model_checkpoint, early_stopping]
trainer = pl.Trainer(accelerator="gpu", precision=16, max_epochs=10, callbacks=training_callbacks)

trainer.fit(model_module, train_data_loader, val_data_loader)