I used the Swin Transformer model from torchvision:
model = models.swin_b(weights='IMAGENET1K_V1')
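For context, my task has 2 classes, so I replaced the 1000-class ImageNet head roughly like this (a sketch, not my exact training code; it is why the printed structure below ends with head: Linear(in_features=1024, out_features=2)):

import torch
from torchvision import models

model = models.swin_b(weights='IMAGENET1K_V1')
model.head = torch.nn.Linear(model.head.in_features, 2)  # 2-class head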
- The model structure is:
SwinTransformer(
(features): Sequential(
(0): Sequential(
(0): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
(1): Permute()
(2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
)
(1): Sequential(
(0): SwinTransformerBlock(
(norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=128, out_features=384, bias=True)
(proj): Linear(in_features=128, out_features=128, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.0, mode=row)
(norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=128, out_features=512, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=512, out_features=128, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=128, out_features=384, bias=True)
(proj): Linear(in_features=128, out_features=128, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.021739130434782608, mode=row)
(norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=128, out_features=512, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=512, out_features=128, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
(2): PatchMerging(
(reduction): Linear(in_features=512, out_features=256, bias=False)
(norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(3): Sequential(
(0): SwinTransformerBlock(
(norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=256, out_features=768, bias=True)
(proj): Linear(in_features=256, out_features=256, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.043478260869565216, mode=row)
(norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=256, out_features=1024, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=1024, out_features=256, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=256, out_features=768, bias=True)
(proj): Linear(in_features=256, out_features=256, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.06521739130434782, mode=row)
(norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=256, out_features=1024, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=1024, out_features=256, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
(4): PatchMerging(
(reduction): Linear(in_features=1024, out_features=512, bias=False)
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(5): Sequential(
(0): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.08695652173913043, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.10869565217391304, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(2): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.13043478260869565, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(3): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.15217391304347827, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(4): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.17391304347826086, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(5): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.1956521739130435, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(6): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.21739130434782608, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(7): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.2391304347826087, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(8): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.2608695652173913, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(9): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.2826086956521739, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(10): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.30434782608695654, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(11): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.32608695652173914, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(12): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.34782608695652173, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(13): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.3695652173913043, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(14): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.391304347826087, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(15): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.41304347826086957, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(16): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.43478260869565216, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(17): SwinTransformerBlock(
(norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=512, out_features=1536, bias=True)
(proj): Linear(in_features=512, out_features=512, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.45652173913043476, mode=row)
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=2048, out_features=512, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
(6): PatchMerging(
(reduction): Linear(in_features=2048, out_features=1024, bias=False)
(norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
)
(7): Sequential(
(0): SwinTransformerBlock(
(norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.4782608695652174, mode=row)
(norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=4096, out_features=1024, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
(1): SwinTransformerBlock(
(norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(attn): ShiftedWindowAttention(
(qkv): Linear(in_features=1024, out_features=3072, bias=True)
(proj): Linear(in_features=1024, out_features=1024, bias=True)
)
(stochastic_depth): StochasticDepth(p=0.5, mode=row)
(norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): MLP(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
(2): Dropout(p=0.0, inplace=False)
(3): Linear(in_features=4096, out_features=1024, bias=True)
(4): Dropout(p=0.0, inplace=False)
)
)
)
)
(norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(permute): Permute()
(avgpool): AdaptiveAvgPool2d(output_size=1)
(flatten): Flatten(start_dim=1, end_dim=-1)
(head): Linear(in_features=1024, out_features=2, bias=True)
)
- Also, I set up the target layer like this:
targets = None  # None means the highest-scoring category is used
target_layers = [model.features[-1][-1].norm1]
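As a sanity check (my own snippet, not part of the script), I confirmed which module this resolves to; per the structure above, features[-1] is the last stage (index 7) and [-1] is its last SwinTransformerBlock:

print(model.features[-1][-1].norm1)
# prints: LayerNorm((1024,), eps=1e-05, elementwise_affine=True)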
and the Grad-CAM code is here:
import os
import cv2
import numpy as np
from PIL import Image
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import preprocess_image, show_cam_on_image

n = 0
for i in rln_image_list:
    # Read the sample image and turn it into a normalized input tensor
    img = np.array(Image.open(os.path.join(rln_dir, i)))
    img = np.float32(img) / 255
    input_tensor = preprocess_image(img, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    input_tensor = input_tensor.to(device)
    # Compute the class-activation map (targets=None uses the top-scoring class)
    with GradCAM(model=model, target_layers=target_layers) as cam:
        grayscale_cams = cam(input_tensor=input_tensor, targets=targets)
    cam_image = show_cam_on_image(img, grayscale_cams[0, :], use_rgb=True)
    # Visualize: original image, grayscale CAM, and overlay side by side
    # (renamed from cam to cam_gray so it doesn't shadow the GradCAM object)
    cam_gray = np.uint8(255 * grayscale_cams[0, :])
    cam_gray = cv2.merge([cam_gray, cam_gray, cam_gray])
    images = np.hstack((np.uint8(255 * img), cam_gray, cam_image))
    result = Image.fromarray(images)
    result.save(os.path.join(cat1_dir, "Swin_b_" + i))
    print('#' + str(n), "Saved:", "Swin_b_" + i)
    n += 1
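One thing I am unsure about: torchvision's Swin blocks keep their activations in (B, H, W, C) layout, while GradCAM seems to assume (B, C, H, W), so perhaps a reshape_transform is needed. This is only my guess, based on the transformer examples in the pytorch-grad-cam README (swin_reshape_transform is my own name):

def swin_reshape_transform(tensor):
    # Guess: permute (B, H, W, C) -> (B, C, H, W) before CAM pooling
    return tensor.permute(0, 3, 1, 2)

with GradCAM(model=model, target_layers=target_layers,
             reshape_transform=swin_reshape_transform) as cam:
    grayscale_cams = cam(input_tensor=input_tensor, targets=targets)

Is something like this required here, or is my target layer itself wrong?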
- The resulting heatmaps were not displayed properly: instead of a heatmap, the output contained only some vertical lines.
I really don't know what the problem is, and I can't figure out the correct target layer for the Swin Transformer model.
I tried several different target layers, but they all gave the same result (vertical lines).
Please advise me.
Thank you