I’m trying to use cross-attention to fuse a text feature and an image feature, following the [CrossAttention module from Latent Diffusion](https://github.com/CompVis/latent-diffusion/blob/a506df5756472e2ebaf9078affdde2c4f1502cd4/ldm/modules/attention.py#L152).
Before being fed into the module, the (b, c, h, w) image feature x is reshaped to (b, h*w, c), and a similarity score is computed against the (b, c) text feature (see the call sketch after the module code below).
I wanted sim to represent a pixel-wise score between the image feature and the text feature, but when I print the attention map (the softmax of sim) to check, every value is 1.0.
from torch import nn, einsum
from einops import rearrange


class CrossAttention(nn.Module):
    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        # context_dim = default(context_dim, query_dim)
        self.scale = dim_head ** -0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context=None, mask=None):
        h = self.heads

        q = self.to_q(x)
        # context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        # split heads: (b, n, h*d) -> (b*h, n, d)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))

        # scaled dot-product similarity between image tokens and text tokens
        sim = einsum('b i d, b j d -> b i j', q, k) * self.scale

        # attention, what we cannot get enough of
        attn = sim.softmax(dim=-1)  # sim is a (b, 4096, 1) tensor; its values are not all the same
        print("attnmap of caption&featureX: ", attn)  # but this prints all 1 ??

        out = einsum('b i j, b j d -> b i d', attn, v)
        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
        return self.to_out(out)
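For reference, this is roughly how I call the module. The concrete numbers are hypothetical (batch size 2, c = 512, and a 64x64 feature map so h*w = 4096), and I unsqueeze the (b, c) text feature to (b, 1, c) so the rearrange inside forward() works:

import torch
from einops import rearrange

b, c, h, w = 2, 512, 64, 64        # assumed shapes, just for illustration
x = torch.randn(b, c, h, w)        # image feature
text = torch.randn(b, c)           # text feature

attn_block = CrossAttention(query_dim=c, context_dim=c)

x_seq = rearrange(x, 'b c h w -> b (h w) c')   # (b, 4096, c)
context = text.unsqueeze(1)                    # (b, 1, c), a single text "token"

out = attn_block(x_seq, context=context)       # the printed attention map is all ones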
My guess is that the sim tensor is too long and an overflow occurs in softmax(), but I don’t know how to fix it.
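Here is a minimal standalone check of just the softmax step, using the (b, 4096, 1) shape that the print inside forward() reports (b = 1 and random values, purely to isolate the softmax call):

import torch

# Same shape as the sim tensor inside forward(): (b, 4096, 1), with b = 1 assumed
sim = torch.randn(1, 4096, 1)
attn = sim.softmax(dim=-1)

print(attn.min().item(), attn.max().item())  # 1.0 1.0 -- every entry is exactly 1.0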