I’m using Data2Vec models from the Hugging Face Hub to extract features for the three modalities of a dataset. After processing I have tensors of shape [1, 768] for text, [1, 499, 768] for audio, and [1, 197, 768] for image. I’m trying to pass these into a classification model for training, but I keep running into dimension issues.
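A minimal example of the kind of extraction I mean, shown for the text branch only (the checkpoint name and the mean pooling here are illustrative rather than my exact pipeline; the audio and image models analogously return last_hidden_state tensors of shape [1, 499, 768] and [1, 197, 768]):

<code>import torch
from transformers import AutoTokenizer, AutoModel

# Illustrative checkpoint; substitute whichever Data2Vec text variant is used.
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
text_model = AutoModel.from_pretrained("facebook/data2vec-text-base")

enc = tokenizer("an example utterance", return_tensors="pt")
with torch.no_grad():
    hidden = text_model(**enc).last_hidden_state  # [1, seq_len, 768]
text_feature = hidden.mean(dim=1)                 # [1, 768], the text shape above
</code>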
When I pass the data through a custom Dataset class -> DataLoader -> model, I get the error
RuntimeError: running_mean should contain 197 elements not 1024
I altered the model slightly to include a mean-pooling layer to account for the dimensionality differences, but then I got
ValueError: expected 2D or 3D input (got 4D input)
instead. Next I tried doing the mean pooling while passing the data into the Dataset class rather than inside the model, and got
RuntimeError: running_mean should contain 1 elements not 1024
I suspect this is a batch normalization problem, but I don’t understand why 197 would still cause an issue after I mean-pooled it. Besides, I never declared either 197 or 1 as the batch normalization dimension anywhere.
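Here is a standalone snippet with dummy tensors that reproduces the same error for me (a simplified repro, not my training code; the 768 -> 1024 projection followed by BatchNorm1d mirrors the *_share / norm_* pairs in the model below):

<code>import torch
import torch.nn as nn

proj = nn.Linear(768, 1024)   # mirrors e.g. B_utterance_share in the model below
bn = nn.BatchNorm1d(1024)     # mirrors e.g. norm_B_utterance

image_feats = torch.randn(4, 197, 768)   # a batch of un-pooled image features

try:
    bn(proj(image_feats))     # Linear keeps the 197 dim -> [4, 197, 1024]
except RuntimeError as e:
    print(e)                  # running_mean should contain 197 elements not 1024

pooled = image_feats.mean(dim=1)   # pool over the 197 patch tokens -> [4, 768]
print(bn(proj(pooled)).shape)      # torch.Size([4, 1024])
</code>

So the 197 (and, after my pooling attempt, the leftover 1) apparently comes from BatchNorm1d reading the second dimension of a 3D input as its channel count rather than from anything I declared explicitly; what I’m unsure about is the right place to collapse that extra dimension.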
For reference, this is the model I want to pass the data into.
<code>import torch
import torch.nn as nn


class Speaker_Dependent_Triple_Mode_with_Context(nn.Module):
    def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024,
                 input_embedding_C=audio_embedding_size, shared_embedding=1024,
                 projection_embedding=512, dropout=0.5, num_classes=5):
        super(Speaker_Dependent_Triple_Mode_with_Context, self).__init__()

        self.n_speaker = n_speaker
        self.input_embedding_A = input_embedding_A
        self.input_embedding_B = input_embedding_B
        self.input_embedding_C = input_embedding_C
        self.shared_embedding = shared_embedding
        self.projection_embedding = projection_embedding
        self.num_classes = num_classes
        self.dropout = dropout

        self.A_context_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)
        self.A_utterance_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)
        self.C_context_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)
        self.C_utterance_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)
        self.B_context_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)
        self.B_utterance_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)

        self.norm_A_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding)
        self.norm_C_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding)
        self.norm_B_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.collabrative_gate_1 = nn.Linear(
            2*self.shared_embedding, self.projection_embedding)
        self.collabrative_gate_2 = nn.Linear(
            self.projection_embedding, self.shared_embedding)

        self.pred_module = nn.Sequential(
            nn.Linear(self.n_speaker + 3*self.shared_embedding,
                      2*self.shared_embedding),
            nn.BatchNorm1d(2*self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(2*self.shared_embedding, self.shared_embedding),
            nn.BatchNorm1d(self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(self.shared_embedding, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, self.num_classes)
        )

    def attention(self, featureA, featureB):
        """Takes two features and calculates the pairwise attention."""
        input = torch.cat((featureA, featureB), dim=1)
        return nn.functional.softmax(self.collabrative_gate_1(input), dim=1)

    def attention_aggregator(self, feA, feB, feC, feD, feE, feF):
        """Calculates the attention for feA with respect to all the others."""
        input = self.attention(feA, feB) + self.attention(feA, feC) + self.attention(
            feA, feD) + self.attention(feA, feE) + self.attention(feA, feF)
        # here we call for pairwise attention
        return nn.functional.softmax(self.collabrative_gate_2(input), dim=1)

    def forward(self, uA, cA, uB, cB, uC, cC, speaker_embedding):
        """Args:
            uA: utterance video
            uB: utterance text
            uC: utterance audio
            cA: context video
            cB: context text
            cC: context audio
        Returns:
            Scores over the emotion classes. (CrossEntropyLoss is used as the
            loss function, so softmax is not applied here; CrossEntropyLoss
            applies it internally while computing the loss. At evaluation time
            softmax has to be applied explicitly.)
        """
        # Feature projection: bring all features to the same shared dimension.
        shared_A_context = self.norm_A_context(
            nn.functional.relu(self.A_context_share(cA)))
        shared_A_utterance = self.norm_A_utterance(
            nn.functional.relu(self.A_utterance_share(uA)))
        shared_C_context = self.norm_C_context(
            nn.functional.relu(self.C_context_share(cC)))
        shared_C_utterance = self.norm_C_utterance(
            nn.functional.relu(self.C_utterance_share(uC)))
        shared_B_context = self.norm_B_context(
            nn.functional.relu(self.B_context_share(cB)))
        shared_B_utterance = self.norm_B_utterance(
            nn.functional.relu(self.B_utterance_share(uB)))

        updated_shared_A = shared_A_utterance * self.attention_aggregator(
            shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance)
        updated_shared_C = shared_C_utterance * self.attention_aggregator(
            shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance)
        updated_shared_B = shared_B_utterance * self.attention_aggregator(
            shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance)

        temp = torch.cat((updated_shared_A, updated_shared_C), dim=1)
        input = torch.cat((temp, updated_shared_B), dim=1)
        input = torch.cat((input, speaker_embedding), dim=1)

        return self.pred_module(input)
</code>
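For completeness, this is roughly how I wire the features into the model. The Dataset below is a stripped-down sketch (the dict keys, dummy data, and pooling placement are illustrative, not my exact pipeline): it pools the audio/image sequences over dim 1 and squeezes the leading singleton dimension so each sample is a flat 768-dim vector, and the DataLoader stacks them into [batch, 768].

<code>import torch
from torch.utils.data import Dataset, DataLoader

# Dummy stand-ins for the extracted Data2Vec features.
samples = [
    {"text": torch.randn(1, 768),
     "audio": torch.randn(1, 499, 768),
     "image": torch.randn(1, 197, 768)}
    for _ in range(16)
]

class MultimodalFeatures(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        text = s["text"].squeeze(0)                  # [768]
        audio = s["audio"].mean(dim=1).squeeze(0)    # [768]
        image = s["image"].mean(dim=1).squeeze(0)    # [768]
        return text, audio, image

loader = DataLoader(MultimodalFeatures(samples), batch_size=8)
text_b, audio_b, image_b = next(iter(loader))
print(text_b.shape, audio_b.shape, image_b.shape)    # all [8, 768]
</code>

When the pooling or the squeeze is left out, the DataLoader stacks its batch dimension on top of the kept [1, ...] dimension and the model receives 3D or 4D inputs, which is when the BatchNorm errors above appear. Is pooling inside the Dataset like this the right way to feed this model, or is something else causing the BatchNorm errors?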