I’m using Data2Vec models from the Hugging Face Hub to extract features for the three modalities of a dataset. After processing I have tensors of shape [1, 768] for text, [1, 499, 768] for audio, and [1, 197, 768] for image. I’m trying to pass these into a classification model for training, but I keep running into dimension issues.
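A minimal example of the kind of extraction I mean, shown for the text branch only (the checkpoint name and the mean pooling here are illustrative rather than my exact pipeline; the audio and image models analogously return last_hidden_state tensors of shape [1, 499, 768] and [1, 197, 768]):

<code>import torch
from transformers import AutoTokenizer, AutoModel

# Illustrative checkpoint; substitute whichever Data2Vec text variant is used.
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
text_model = AutoModel.from_pretrained("facebook/data2vec-text-base")

enc = tokenizer("an example utterance", return_tensors="pt")
with torch.no_grad():
    hidden = text_model(**enc).last_hidden_state  # [1, seq_len, 768]
text_feature = hidden.mean(dim=1)                 # [1, 768], the text shape above
</code>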
When I pass the data through a custom Dataset class -> DataLoader -> model, I get the error
RuntimeError: running_mean should contain 197 elements not 1024
I altered the model slightly to include a mean-pooling layer to account for the dimensionality differences, but then I got
ValueError: expected 2D or 3D input (got 4D input)
instead. Next I tried doing the mean pooling while passing the data into the Dataset class rather than inside the model, and got
RuntimeError: running_mean should contain 1 elements not 1024
I suspect this is a batch normalization problem, but I don’t understand why 197 would still cause an issue after I mean-pooled it. Besides, I never declared either 197 or 1 as the batch normalization dimension anywhere.
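Here is a standalone snippet with dummy tensors that reproduces the same error for me (a simplified repro, not my training code; the 768 -> 1024 projection followed by BatchNorm1d mirrors the *_share / norm_* pairs in the model below):

<code>import torch
import torch.nn as nn

proj = nn.Linear(768, 1024)   # mirrors e.g. B_utterance_share in the model below
bn = nn.BatchNorm1d(1024)     # mirrors e.g. norm_B_utterance

image_feats = torch.randn(4, 197, 768)   # a batch of un-pooled image features

try:
    bn(proj(image_feats))     # Linear keeps the 197 dim -> [4, 197, 1024]
except RuntimeError as e:
    print(e)                  # running_mean should contain 197 elements not 1024

pooled = image_feats.mean(dim=1)   # pool over the 197 patch tokens -> [4, 768]
print(bn(proj(pooled)).shape)      # torch.Size([4, 1024])
</code>

So the 197 (and, after my pooling attempt, the leftover 1) apparently comes from BatchNorm1d reading the second dimension of a 3D input as its channel count rather than from anything I declared explicitly; what I’m unsure about is the right place to collapse that extra dimension.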
For reference, this is the model I want to pass the data into.
<code>import torch
import torch.nn as nn


class Speaker_Dependent_Triple_Mode_with_Context(nn.Module):
    def __init__(self, n_speaker=24, input_embedding_A=2048, input_embedding_B=1024,
                 input_embedding_C=audio_embedding_size, shared_embedding=1024,
                 projection_embedding=512, dropout=0.5, num_classes=5):
        super(Speaker_Dependent_Triple_Mode_with_Context, self).__init__()

        self.n_speaker = n_speaker
        self.input_embedding_A = input_embedding_A
        self.input_embedding_B = input_embedding_B
        self.input_embedding_C = input_embedding_C
        self.shared_embedding = shared_embedding
        self.projection_embedding = projection_embedding
        self.num_classes = num_classes
        self.dropout = dropout

        self.A_context_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)
        self.A_utterance_share = nn.Linear(
            self.input_embedding_A, self.shared_embedding)
        self.C_context_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)
        self.C_utterance_share = nn.Linear(
            self.input_embedding_C, self.shared_embedding)
        self.B_context_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)
        self.B_utterance_share = nn.Linear(
            self.input_embedding_B, self.shared_embedding)

        self.norm_A_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_A_utterance = nn.BatchNorm1d(self.shared_embedding)
        self.norm_C_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_C_utterance = nn.BatchNorm1d(self.shared_embedding)
        self.norm_B_context = nn.BatchNorm1d(self.shared_embedding)
        self.norm_B_utterance = nn.BatchNorm1d(self.shared_embedding)

        self.collabrative_gate_1 = nn.Linear(
            2*self.shared_embedding, self.projection_embedding)
        self.collabrative_gate_2 = nn.Linear(
            self.projection_embedding, self.shared_embedding)

        self.pred_module = nn.Sequential(
            nn.Linear(self.n_speaker + 3*self.shared_embedding,
                      2*self.shared_embedding),
            nn.BatchNorm1d(2*self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(2*self.shared_embedding, self.shared_embedding),
            nn.BatchNorm1d(self.shared_embedding),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(self.shared_embedding, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, self.num_classes)
        )

    def attention(self, featureA, featureB):
        """Takes two features and calculates the pairwise attention."""
        input = torch.cat((featureA, featureB), dim=1)
        return nn.functional.softmax(self.collabrative_gate_1(input), dim=1)

    def attention_aggregator(self, feA, feB, feC, feD, feE, feF):
        """Calculates the attention for feA with respect to all the others."""
        input = self.attention(feA, feB) + self.attention(feA, feC) + self.attention(
            feA, feD) + self.attention(feA, feE) + self.attention(feA, feF)
        # here we call for pairwise attention
        return nn.functional.softmax(self.collabrative_gate_2(input), dim=1)

    def forward(self, uA, cA, uB, cB, uC, cC, speaker_embedding):
        """Args:
            uA: utterance video
            uB: utterance text
            uC: utterance audio
            cA: context video
            cB: context text
            cC: context audio
        Returns:
            Scores over the emotion classes. (CrossEntropyLoss is used as the
            loss function, so softmax is not applied here; CrossEntropyLoss
            applies it internally while computing the loss. At evaluation time
            softmax has to be applied explicitly.)
        """
        # Feature projection: bring all features to the same shared dimension.
        shared_A_context = self.norm_A_context(
            nn.functional.relu(self.A_context_share(cA)))
        shared_A_utterance = self.norm_A_utterance(
            nn.functional.relu(self.A_utterance_share(uA)))
        shared_C_context = self.norm_C_context(
            nn.functional.relu(self.C_context_share(cC)))
        shared_C_utterance = self.norm_C_utterance(
            nn.functional.relu(self.C_utterance_share(uC)))
        shared_B_context = self.norm_B_context(
            nn.functional.relu(self.B_context_share(cB)))
        shared_B_utterance = self.norm_B_utterance(
            nn.functional.relu(self.B_utterance_share(uB)))

        updated_shared_A = shared_A_utterance * self.attention_aggregator(
            shared_A_utterance, shared_A_context, shared_C_context, shared_C_utterance, shared_B_context, shared_B_utterance)
        updated_shared_C = shared_C_utterance * self.attention_aggregator(
            shared_C_utterance, shared_C_context, shared_A_context, shared_A_utterance, shared_B_context, shared_B_utterance)
        updated_shared_B = shared_B_utterance * self.attention_aggregator(
            shared_B_utterance, shared_B_context, shared_A_context, shared_A_utterance, shared_C_context, shared_C_utterance)

        temp = torch.cat((updated_shared_A, updated_shared_C), dim=1)
        input = torch.cat((temp, updated_shared_B), dim=1)
        input = torch.cat((input, speaker_embedding), dim=1)

        return self.pred_module(input)
</code>
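For completeness, this is roughly how I wire the features into the model. The Dataset below is a stripped-down sketch (the dict keys, dummy data, and pooling placement are illustrative, not my exact pipeline): it pools the audio/image sequences over dim 1 and squeezes the leading singleton dimension so each sample is a flat 768-dim vector, and the DataLoader stacks them into [batch, 768].

<code>import torch
from torch.utils.data import Dataset, DataLoader

# Dummy stand-ins for the extracted Data2Vec features.
samples = [
    {"text": torch.randn(1, 768),
     "audio": torch.randn(1, 499, 768),
     "image": torch.randn(1, 197, 768)}
    for _ in range(16)
]

class MultimodalFeatures(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        s = self.samples[idx]
        text = s["text"].squeeze(0)                  # [768]
        audio = s["audio"].mean(dim=1).squeeze(0)    # [768]
        image = s["image"].mean(dim=1).squeeze(0)    # [768]
        return text, audio, image

loader = DataLoader(MultimodalFeatures(samples), batch_size=8)
text_b, audio_b, image_b = next(iter(loader))
print(text_b.shape, audio_b.shape, image_b.shape)    # all [8, 768]
</code>

When the pooling or the squeeze is left out, the DataLoader stacks its batch dimension on top of the kept [1, ...] dimension and the model receives 3D or 4D inputs, which is when the BatchNorm errors above appear. Is pooling inside the Dataset like this the right way to feed this model, or is something else causing the BatchNorm errors?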