update README.md and models/swiftformer.py

2023-07-26 00:59:51 +04:00
parent 075daf69f8
commit 670dea3e1f
2 changed files with 19 additions and 21 deletions
--- a/README.md
+++ b/README.md
@@ -47,10 +47,10 @@ Self-attention has become a de facto choice for capturing global context in vario

 | Model | Top-1 accuracy | #params | GMACs | Latency | Ckpt | CoreML|
 |:---------------|:----:|:---:|:--:|:--:|:--:|:--:|
-| SwiftFormer-XS |   75.7%    |     3.5M    |   0.4G   |      0.7ms     |  [XS](https://drive.google.com/file/d/15Ils-U96pQePXQXx2MpmaI-yAceFAr2x/view?usp=sharing)    |   [XS](https://drive.google.com/file/d/1tZVxtbtAZoLLoDc5qqoUGulilksomLeK/view?usp=sharing)    |
-| SwiftFormer-S  |   78.5%    |     6.1M    |   1.0G   |      0.8ms     |   [S](https://drive.google.com/file/d/1_0eWwgsejtS0bWGBQS3gwAtYjXdPRGlu/view?usp=sharing)   |   [S](https://drive.google.com/file/d/13EOCZmtvbMR2V6UjezSZnbBz2_-59Fva/view?usp=sharing)    |
-| SwiftFormer-L1 |   80.9%   |    12.1M   |   1.6G   |      1.1ms     |   [L1](https://drive.google.com/file/d/1jlwrwWQ0SQzDRc5adtWIwIut5d1g9EsM/view?usp=sharing)   |   [L1](https://drive.google.com/file/d/1c3VUsi4q7QQ2ykXVS2d4iCRL478fWF3e/view?usp=sharing)    |
-| SwiftFormer-L3 |   83.0%   |    28.5M    |   4.0G   |      1.9ms     |  [L3](https://drive.google.com/file/d/1ypBcjx04ShmPYRhhjBRubiVjbExUgSa7/view?usp=sharing)    |   [L3](https://drive.google.com/file/d/1svahgIjh7da781jHOHjX58mtzCzYXSsJ/view?usp=sharing)   |
+| SwiftFormer-XS |   75.7%    |     3.5M    |   0.6G   |      0.7ms     |  [XS](https://drive.google.com/file/d/12RchxzyiJrtZS-2Bur9k4wcRQMItA43S/view?usp=sharing)    |   [XS](https://drive.google.com/file/d/1bkAP_BD6CdDqlbQsStZhLa0ST2NZTIvH/view?usp=sharing)    |
+| SwiftFormer-S  |   78.5%    |     6.1M    |   1.0G   |      0.8ms     |   [S](https://drive.google.com/file/d/1awpcXAaHH38WaHrOmUM8updxQazUZ3Nb/view?usp=sharing)   |   [S](https://drive.google.com/file/d/1qNAhecWIeQ1YJotWhbnLTCR5Uv1zBaf1/view?usp=sharing)    |
+| SwiftFormer-L1 |   80.9%   |    12.1M   |   1.6G   |      1.1ms     |   [L1](https://drive.google.com/file/d/1SDzauVmpR5uExkOv3ajxdwFnP-Buj9Uo/view?usp=sharing)   |   [L1](https://drive.google.com/file/d/1CowZE7-lbxz93uwXqefe-HxGOHUdvX_a/view?usp=sharing)    |
+| SwiftFormer-L3 |   83.0%   |    28.5M    |   4.0G   |      1.9ms     |  [L3](https://drive.google.com/file/d/1DAxMe6FlnZBBIpR-HYIDfFLWJzIgiF0Y/view?usp=sharing)    |   [L3](https://drive.google.com/file/d/1SO3bRWd9oWJemy-gpYUcwP-B4bJ-dsdg/view?usp=sharing)   |


 ## Detection and Segmentation Qualitative Results
@@ -77,6 +77,7 @@ conda activate swiftformer

 pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
 pip install timm
+pip install coremltools==5.2.0
 ```

 ### Data preparation
--- a/models/swiftformer.py
+++ b/models/swiftformer.py
@@ -25,9 +25,6 @@ SwiftFormer_depth = {
    'l3': [4, 4, 12, 6],
 }

-CoreMLConversion = False
-
-
 def stem(in_chs, out_chs):
    """
    Stem Layer that is implemented by two layers of conv.
@@ -144,8 +141,8 @@ class Mlp(nn.Module):
 class EfficientAdditiveAttnetion(nn.Module):
    """
    Efficient Additive Attention module for SwiftFormer.
-    Input: tensor in shape [B, C, H, W]
-    Output: tensor in shape [B, C, H, W]
+    Input: tensor in shape [B, N, D]
+    Output: tensor in shape [B, N, D]
    """

    def __init__(self, in_dims=512, token_dim=256, num_heads=2):
@@ -163,26 +160,23 @@ class EfficientAdditiveAttnetion(nn.Module):
        query = self.to_query(x)
        key = self.to_key(x)

-        if not CoreMLConversion:
-            # torch.nn.functional.normalize is not supported by the ANE of iPhone devices.
-            # Using this layer improves the accuracy by ~0.1-0.2%
-            query = torch.nn.functional.normalize(query, dim=-1)
-            key = torch.nn.functional.normalize(key, dim=-1)
+        query = torch.nn.functional.normalize(query, dim=-1) #BxNxD
+        key = torch.nn.functional.normalize(key, dim=-1) #BxNxD

-        query_weight = query @ self.w_g
-        A = query_weight * self.scale_factor
+        query_weight = query @ self.w_g # BxNx1 (BxNxD @ Dx1)
+        A = query_weight * self.scale_factor # BxNx1

-        A = A.softmax(dim=-1)
+        A = torch.nn.functional.normalize(A, dim=1) # BxNx1

-        G = torch.sum(A * query, dim=1)
+        G = torch.sum(A * query, dim=1) # BxD

        G = einops.repeat(
            G, "b d -> b repeat d", repeat=key.shape[1]
-        )
+        ) # BxNxD

-        out = self.Proj(G * key) + query
+        out = self.Proj(G * key) + query #BxNxD

-        out = self.final(out)
+        out = self.final(out) # BxNxD

        return out

@@ -215,6 +209,7 @@ class SwiftFormerLocalRepresentation(nn.Module):
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
+        print("SwiftFormerLocalRepresentation input is ", x.shape)
        input = x
        x = self.dwconv(x)
        x = self.norm(x)
@@ -225,6 +220,7 @@ class SwiftFormerLocalRepresentation(nn.Module):
            x = input + self.drop_path(self.layer_scale * x)
        else:
            x = input + self.drop_path(x)
+        
        return x


@@ -505,3 +501,4 @@ def SwiftFormer_L3(pretrained=False, **kwargs):
        **kwargs)
    model.default_cfg = _cfg(crop_pct=0.9)
    return model
+