I’m using Data2Vec models from the Hugging Face Hub to extract features from the three modalities of a dataset. After processing I have tensors of shape [1, 768] for text, [1, 499, 768] for audio, and [1, 197, 768] for image. I’m trying to feed these into a classification model for training, but I’m running into dimension issues.
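For context, this is roughly how the features are extracted. The exact checkpoints, the dummy inputs, and the choice of CLS pooling for the text features are placeholders for this sketch rather than my exact code; the point is just where the shapes above come from.

import numpy as np
import torch
from transformers import AutoFeatureExtractor, AutoImageProcessor, AutoModel, AutoTokenizer

# --- Text: last_hidden_state is [1, seq_len, 768]; keeping one token gives [1, 768]
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
text_model = AutoModel.from_pretrained("facebook/data2vec-text-base")
tokens = tokenizer("an example utterance", return_tensors="pt")
with torch.no_grad():
    text_feat = text_model(**tokens).last_hidden_state[:, 0]        # [1, 768]

# --- Audio: ~10 s of 16 kHz audio comes out as roughly [1, 499, 768]
audio_fe = AutoFeatureExtractor.from_pretrained("facebook/data2vec-audio-base-960h")
audio_model = AutoModel.from_pretrained("facebook/data2vec-audio-base-960h")
waveform = np.random.randn(16000 * 10)                              # dummy clip
audio_in = audio_fe(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    audio_feat = audio_model(**audio_in).last_hidden_state          # [1, ~499, 768]

# --- Image: 224x224 input -> 196 patches + CLS token = [1, 197, 768]
image_proc = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
image_model = AutoModel.from_pretrained("facebook/data2vec-vision-base")
dummy_img = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
image_in = image_proc(dummy_img, return_tensors="pt")
with torch.no_grad():
    image_feat = image_model(**image_in).last_hidden_state          # [1, 197, 768]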
When I pass the data through a custom Dataset class -> DataLoader -> model, I get
RuntimeError: running_mean should contain 197 elements not 1024
I then altered the model slightly, adding a mean-pooling layer to account for the dimensionality differences, but that gave me
ValueError: expected 2D or 3D input (got 4D input)
instead. Finally I tried applying the mean pooling while loading the data in the Dataset class rather than inside the model, but that produced
RuntimeError: running_mean should contain 1 elements not 1024
I suspect this is a batch-normalization problem, but I don’t understand why 197 would still cause an issue after mean pooling. Besides, I never declared 197 or 1 as the batch-normalization dimension anywhere.
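To make my current understanding concrete, here is a minimal standalone sketch (assumed shapes, not my actual pipeline) of how BatchNorm1d seems to be interpreting the dimensions when the image features reach one of the norm layers:

import torch
import torch.nn as nn

proj = nn.Linear(768, 1024)           # stands in for one of the *_share projections
norm = nn.BatchNorm1d(1024)           # stands in for the matching norm layer (shared_embedding=1024)

img_feats = torch.randn(1, 197, 768)  # Data2Vec image features

x = proj(img_feats)                   # nn.Linear acts on the last dim -> [1, 197, 1024]

# BatchNorm1d treats a 3D input as [N, C, L] and normalizes over dim 1,
# so here it sees C=197 while its running_mean has 1024 entries:
try:
    norm(x)
except RuntimeError as e:
    print(e)                          # running_mean should contain 197 elements not 1024

# Pooling the sequence dimension away first gives a 2D [N, C] input that the
# layer accepts, provided the batch dimension comes first and is larger than 1 in training:
pooled = proj(torch.randn(4, 197, 768)).mean(dim=1)   # [4, 1024]
print(norm(pooled).shape)                              # torch.Size([4, 1024])

If that reading is right, I assume the later errors come from the extra leading 1 the extractor leaves on every tensor: the DataLoader stacks [1, 197, 768] samples into a 4D [B, 1, 197, 768] batch (hence "got 4D input"), and after pooling inside the Dataset the stacked batch is [B, 1, 768], so BatchNorm1d sees only 1 channel.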
For reference, this is the model I want to pass the data into:
import torch
import torch.nn as nn

class Speaker_Dependent_Triple_Mode_with_Context(nn.Module):
    def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024,
                 input_embedding_C=audio_embedding_size,  # audio_embedding_size is defined elsewhere in my script
                 shared_embedding=1024, projection_embedding=512, dropout=0.5, num_classes=5):
        super(Speaker_Dependent_Triple_Mode_with_Context, self).__init__()
        self.n_speaker = n_speaker

        self.input_embedding_A = input_embedding_A
        self.input_embedding_B = input_embedding_B
        self.input_embedding_C = input_embedding_C

        self.shared_embedding = shared_embedding
        self.projection_embedding = projection_embedding
        self.num_classes = num_classes
        self.dropout = dropout

        self.A_context_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)
        self.A_utterance_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)

        self.C_context_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)
        self.C_utterance_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)

        self.B_context_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)
        self.B_utterance_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)

        self.norm_A_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.norm_C_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.norm_B_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.collabrative_gate_1 = nn.Linear(
            2 * self.shared_embedding, self.projection_embedding)
        self.collabrative_gate_2 = nn.Linear(
            self.projection_embedding, self.shared_embedding)

        self.pred_module = nn.Sequential(
            nn.Linear(self.n_speaker + 3 * self.shared_embedding,
                      2 * self.shared_embedding),
            nn.BatchNorm1d(2 * self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(2 * self.shared_embedding, self.shared_embedding),
            nn.BatchNorm1d(self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(self.shared_embedding, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, self.num_classes)
        )

    def attention(self, featureA, featureB):
        """Takes two features and calculates the attention."""
        input = torch.cat((featureA, featureB), dim=1)
        return nn.functional.softmax(self.collabrative_gate_1(input), dim=1)

    def attention_aggregator(self, feA, feB, feC, feD, feE, feF):
        """Calculates the attention for feA with respect to the other features."""
        input = self.attention(feA, feB) + self.attention(feA, feC) + self.attention(
            feA, feD) + self.attention(feA, feE) + self.attention(feA, feF)
        # here we call for pairwise attention
        return nn.functional.softmax(self.collabrative_gate_2(input), dim=1)

    def forward(self, uA, cA, uB, cB, uC, cC, speaker_embedding):
        """Args:
            uA: utterance video
            uB: utterance text
            uC: utterance audio
            cA: context video
            cB: context text
            cC: context audio
        Returns:
            Logits over the emotion classes.
            (Since cross-entropy is used as the loss function, no softmax is applied
            here; nn.CrossEntropyLoss applies softmax while computing the loss.
            At evaluation time softmax has to be applied explicitly.)
        """
        # Feature projection: bring all features to the same dimension
        shared_A_context = self.norm_A_context(
            nn.functional.relu(self.A_context_share(cA)))
        shared_A_utterance = self.norm_A_utterance(
            nn.functional.relu(self.A_utterance_share(uA)))

        shared_C_context = self.norm_C_context(
            nn.functional.relu(self.C_context_share(cC)))
        shared_C_utterance = self.norm_C_utterance(
            nn.functional.relu(self.C_utterance_share(uC)))

        shared_B_context = self.norm_B_context(
            nn.functional.relu(self.B_context_share(cB)))
        shared_B_utterance = self.norm_B_utterance(
            nn.functional.relu(self.B_utterance_share(uB)))

        updated_shared_A = shared_A_utterance * self.attention_aggregator(
            shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance)
        updated_shared_C = shared_C_utterance * self.attention_aggregator(
            shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance)
        updated_shared_B = shared_B_utterance * self.attention_aggregator(
            shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance)

        temp = torch.cat((updated_shared_A, updated_shared_C), dim=1)
        input = torch.cat((temp, updated_shared_B), dim=1)
        input = torch.cat((input, speaker_embedding), dim=1)

        return self.pred_module(input)
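For completeness, this is the per-sample shape I assume the model needs if the BatchNorm1d layers are to see 2D [batch, features] inputs: one flat 768-dim vector per modality, with the extractor's leading 1 squeezed out so the DataLoader's batch dimension is the only one (which would presumably also mean the input_embedding_* sizes have to be 768, and a training batch size greater than 1). The Dataset below is a hypothetical sketch, not my real one, and it ignores the context features and speaker embedding the model also expects.

import torch
from torch.utils.data import Dataset, DataLoader

class MultimodalFeatures(Dataset):
    """Hypothetical Dataset that pools/squeezes cached Data2Vec features so each
    modality comes out as a flat 768-dim vector per sample."""

    def __init__(self, text_feats, audio_feats, image_feats, labels):
        # text_feats[i]: [1, 768], audio_feats[i]: [1, 499, 768], image_feats[i]: [1, 197, 768]
        self.text_feats = text_feats
        self.audio_feats = audio_feats
        self.image_feats = image_feats
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.text_feats[idx].squeeze(0)                 # [768]
        audio = self.audio_feats[idx].squeeze(0).mean(dim=0)   # [499, 768] -> [768]
        image = self.image_feats[idx].squeeze(0).mean(dim=0)   # [197, 768] -> [768]
        return text, audio, image, self.labels[idx]

# The DataLoader then adds the real batch dimension, e.g. [32, 768] per modality,
# which is the 2D shape nn.BatchNorm1d expects:
# loader = DataLoader(MultimodalFeatures(...), batch_size=32, shuffle=True)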