I’m using Data2Vec models from the Hugging Face Hub to extract features from the three modalities of a dataset. After processing I have tensors of shape [1, 768] for text, [1, 499, 768] for audio, and [1, 197, 768] for image. I’m trying to feed these into a classification model for training, but I’m running into dimension issues.
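For context, this is roughly how the features are extracted. The exact checkpoints, the dummy inputs, and the choice of CLS pooling for the text features are placeholders for this sketch rather than my exact code; the point is just where the shapes above come from.

import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoModel, AutoTokenizer

# --- Text: last_hidden_state is [1, seq_len, 768]; keeping one token gives [1, 768]
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
text_model = AutoModel.from_pretrained("facebook/data2vec-text-base")
tokens = tokenizer("an example utterance", return_tensors="pt")
with torch.no_grad():
    text_feat = text_model(**tokens).last_hidden_state[:, 0]        # [1, 768]

# --- Audio: ~10 s of 16 kHz audio comes out as roughly [1, 499, 768]
audio_fe = AutoFeatureExtractor.from_pretrained("facebook/data2vec-audio-base-960h")
audio_model = AutoModel.from_pretrained("facebook/data2vec-audio-base-960h")
waveform = np.random.randn(16000 * 10)                              # dummy clip
audio_in = audio_fe(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    audio_feat = audio_model(**audio_in).last_hidden_state          # [1, ~499, 768]

# --- Image: 224x224 input -> 196 patches + CLS token = [1, 197, 768]
image_proc = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
image_model = AutoModel.from_pretrained("facebook/data2vec-vision-base")
dummy_img = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
image_in = image_proc(dummy_img, return_tensors="pt")
with torch.no_grad():
    image_feat = image_model(**image_in).last_hidden_state          # [1, 197, 768]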
When I pass the data through a custom Dataset class -> DataLoader -> model, I get
RuntimeError: running_mean should contain 197 elements not 1024
I then altered the model slightly, adding a mean-pooling layer to account for the dimensionality differences, but that gave me
ValueError: expected 2D or 3D input (got 4D input)
instead. Finally I tried applying the mean pooling while loading the data in the Dataset class rather than inside the model, but that produced
RuntimeError: running_mean should contain 1 elements not 1024
I suspect this is a batch-normalization problem, but I don’t understand why 197 would still cause an issue after mean pooling. Besides, I never declared 197 or 1 as the batch-normalization dimension anywhere.
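To make my current understanding concrete, here is a minimal standalone sketch (assumed shapes, not my actual pipeline) of how BatchNorm1d seems to be interpreting the dimensions when the image features reach one of the norm layers:

import torch
import torch.nn as nn

proj = nn.Linear(768, 1024)           # stands in for one of the *_share projections
norm = nn.BatchNorm1d(1024)           # stands in for the matching norm layer (shared_embedding=1024)

img_feats = torch.randn(1, 197, 768)  # Data2Vec image features

x = proj(img_feats)                   # nn.Linear acts on the last dim -> [1, 197, 1024]

# BatchNorm1d treats a 3D input as [N, C, L] and normalizes over dim 1,
# so here it sees C=197 while its running_mean has 1024 entries:
try:
    norm(x)
except RuntimeError as e:
    print(e)                          # running_mean should contain 197 elements not 1024

# Pooling the sequence dimension away first gives a 2D [N, C] input that the
# layer accepts, provided the batch dimension comes first and is larger than 1 in training:
pooled = proj(torch.randn(4, 197, 768)).mean(dim=1)   # [4, 1024]
print(norm(pooled).shape)                              # torch.Size([4, 1024])

If that reading is right, I assume the later errors come from the extra leading 1 the extractor leaves on every tensor: the DataLoader stacks [1, 197, 768] samples into a 4D [B, 1, 197, 768] batch (hence "got 4D input"), and after pooling inside the Dataset the stacked batch is [B, 1, 768], so BatchNorm1d sees only 1 channel.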
For reference, this is the model I want to pass the data into:
import torch
import torch.nn as nn

class Speaker_Dependent_Triple_Mode_with_Context(nn.Module):
    def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024,
                 input_embedding_C=audio_embedding_size,  # audio_embedding_size is defined elsewhere in my script
                 shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5):
        super(Speaker_Dependent_Triple_Mode_with_Context, self).__init__()
        self.n_speaker = n_speaker

        self.input_embedding_A = input_embedding_A
        self.input_embedding_B = input_embedding_B
        self.input_embedding_C = input_embedding_C

        self.shared_embedding = shared_embedding
        self.projection_embedding = projection_embedding
        self.num_classes = num_classes
        self.dropout = dropout

        self.A_context_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)
        self.A_utterance_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)

        self.C_context_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)
        self.C_utterance_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)

        self.B_context_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)
        self.B_utterance_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)

        self.norm_A_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.norm_C_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.norm_B_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.collabrative_gate_1 = nn.Linear(
            2 * self.shared_embedding, self.projection_embedding)
        self.collabrative_gate_2 = nn.Linear(
            self.projection_embedding, self.shared_embedding)

        self.pred_module = nn.Sequential(
            nn.Linear(self.n_speaker + 3 * self.shared_embedding,
                      2 * self.shared_embedding),
            nn.BatchNorm1d(2 * self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(2 * self.shared_embedding, self.shared_embedding),
            nn.BatchNorm1d(self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(self.shared_embedding, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, self.num_classes)
        )

    def attention(self, featureA, featureB):
        """Takes two features and calculates the attention."""
        input = torch.cat((featureA, featureB), dim=1)
        return nn.functional.softmax(self.collabrative_gate_1(input), dim=1)

    def attention_aggregator(self, feA, feB, feC, feD, feE, feF):
        """Calculates the attention for feA with respect to the other features."""
        input = self.attention(feA, feB) + self.attention(feA, feC) + self.attention(
            feA, feD) + self.attention(feA, feE) + self.attention(feA, feF)
        # here we call for pairwise attention
        return nn.functional.softmax(self.collabrative_gate_2(input), dim=1)

    def forward(self, uA, cA, uB, cB, uC, cC, speaker_embedding):
        """Args:
            uA: utterance video
            uB: utterance text
            uC: utterance audio
            cA: context video
            cB: context text
            cC: context audio
        Returns:
            Logits over the emotion classes.
            (Since cross-entropy is used as the loss function, no softmax is applied
            here; nn.CrossEntropyLoss applies softmax while computing the loss.
            At evaluation time softmax has to be applied explicitly.)
        """
        # Feature projection: bring all features to the same dimension
        shared_A_context = self.norm_A_context(
            nn.functional.relu(self.A_context_share(cA)))
        shared_A_utterance = self.norm_A_utterance(
            nn.functional.relu(self.A_utterance_share(uA)))

        shared_C_context = self.norm_C_context(
            nn.functional.relu(self.C_context_share(cC)))
        shared_C_utterance = self.norm_C_utterance(
            nn.functional.relu(self.C_utterance_share(uC)))

        shared_B_context = self.norm_B_context(
            nn.functional.relu(self.B_context_share(cB)))
        shared_B_utterance = self.norm_B_utterance(
            nn.functional.relu(self.B_utterance_share(uB)))

        updated_shared_A = shared_A_utterance * self.attention_aggregator(
            shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance)
        updated_shared_C = shared_C_utterance * self.attention_aggregator(
            shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance)
        updated_shared_B = shared_B_utterance * self.attention_aggregator(
            shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance)

        temp = torch.cat((updated_shared_A, updated_shared_C), dim=1)
        input = torch.cat((temp, updated_shared_B), dim=1)
        input = torch.cat((input, speaker_embedding), dim=1)

        return self.pred_module(input)
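For completeness, this is the per-sample shape I assume the model needs if the BatchNorm1d layers are to see 2D [batch, features] inputs: one flat 768-dim vector per modality, with the extractor's leading 1 squeezed out so the DataLoader's batch dimension is the only one (which would presumably also mean the input_embedding_* sizes have to be 768, and a training batch size greater than 1). The Dataset below is a hypothetical sketch, not my real one, and it ignores the context features and speaker embedding the model also expects.

import torch
from torch.utils.data import Dataset, DataLoader

class MultimodalFeatures(Dataset):
    """Hypothetical Dataset that pools/squeezes cached Data2Vec features so each
    modality comes out as a flat 768-dim vector per sample."""

    def __init__(self, text_feats, audio_feats, image_feats, labels):
        # text_feats[i]: [1, 768], audio_feats[i]: [1, 499, 768], image_feats[i]: [1, 197, 768]
        self.text_feats = text_feats
        self.audio_feats = audio_feats
        self.image_feats = image_feats
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.text_feats[idx].squeeze(0)                 # [768]
        audio = self.audio_feats[idx].squeeze(0).mean(dim=0)   # [499, 768] -> [768]
        image = self.image_feats[idx].squeeze(0).mean(dim=0)   # [197, 768] -> [768]
        return text, audio, image, self.labels[idx]

# The DataLoader then adds the real batch dimension, e.g. [32, 768] per modality,
# which is the 2D shape nn.BatchNorm1d expects:
# loader = DataLoader(MultimodalFeatures(...), batch_size=32, shuffle=True)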