From 143e493f650aebb2fa4f9d0d30a3efb201f3ae5b Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 18 Apr 2023 17:14:58 +0800 Subject: [PATCH 01/46] transfer_swav --- passl/models/resnet.py | 265 ++++++++++++++++++ passl/models/swav.py | 135 +++++++++ tasks/ssl/swav/README.md | 108 +++++++ ...se_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml | 148 ++++++++++ ...se_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml | 109 +++++++ ...e_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml | 108 +++++++ ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 110 ++++++++ tasks/ssl/swav/finetune.sh | 28 ++ tasks/ssl/swav/linearprobe.sh | 26 ++ tasks/ssl/swav/pretrain.sh | 26 ++ 10 files changed, 1063 insertions(+) create mode 100644 passl/models/resnet.py create mode 100644 passl/models/swav.py create mode 100644 tasks/ssl/swav/README.md create mode 100644 tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml create mode 100644 tasks/ssl/swav/finetune.sh create mode 100644 tasks/ssl/swav/linearprobe.sh create mode 100644 tasks/ssl/swav/pretrain.sh diff --git a/passl/models/resnet.py b/passl/models/resnet.py new file mode 100644 index 00000000..c76709cc --- /dev/null +++ b/passl/models/resnet.py @@ -0,0 +1,265 @@ +import paddle +from passl.models.base_model import Model + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, + kernel_size=3, stride=stride, padding=dilation, groups=groups, + dilation=dilation, bias_attr=False, ) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, + kernel_size=1, stride=stride, bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + __constants__ = ['downsample'] + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups= + 1, base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = paddle.nn.BatchNorm2D + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + 'Dilation > 1 not supported in BasicBlock') + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = paddle.nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + + +class Bottleneck(paddle.nn.Layer): + expansion = 4 + __constants__ = ['downsample'] + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups= + 1, base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = paddle.nn.BatchNorm2D + width = int(planes * (base_width / 64.0)) * groups + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, 
width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = paddle.nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + +def kaiming_normal_init(param, **kwargs): + initializer = nn.initializer.KaimingNormal(**kwargs) + initializer(param, param.block) + +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + +class ResNet(paddle.nn.Layer): + def __init__(self, block, layers, zero_init_residual=False, groups=1, + widen=1, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, normalize=False, output_dim=0, hidden_mlp=0, + nmb_prototypes=0, eval_mode=False): + + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = paddle.nn.BatchNorm2D + self._norm_layer = norm_layer + self.eval_mode = eval_mode + self.padding = paddle.nn.Pad2D(padding=1, value=0.0) + self.inplanes = width_per_group * widen + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError( + 'replace_stride_with_dilation should be None or a 3-element tuple, got {}' + .format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + num_out_filters = width_per_group * widen + self.conv1 = paddle.nn.Conv2D(in_channels=3, out_channels= + num_out_filters, kernel_size=7, stride=2, padding=2, bias_attr= + False) + self.bn1 = norm_layer(num_out_filters) + self.relu = paddle.nn.ReLU() + self.maxpool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, num_out_filters, layers[0]) + num_out_filters *= 2 + self.layer2 = self._make_layer(block, num_out_filters, layers[1], + stride=2, dilate=replace_stride_with_dilation[0]) + num_out_filters *= 2 + self.layer3 = self._make_layer(block, num_out_filters, layers[2], + stride=2, dilate=replace_stride_with_dilation[1]) + num_out_filters *= 2 + self.layer4 = self._make_layer(block, num_out_filters, layers[3], + stride=2, dilate=replace_stride_with_dilation[2]) + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + self.l2norm = normalize + if output_dim == 0: + self.projection_head = None + elif hidden_mlp == 0: + self.projection_head = paddle.nn.Linear(in_features= + num_out_filters * block.expansion, out_features=output_dim) + else: + self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( + in_features=num_out_filters * block.expansion, out_features + =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, + momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, + bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), + paddle.nn.Linear(in_features=hidden_mlp, out_features= + output_dim)) + self.prototypes = None + if isinstance(nmb_prototypes, list): + self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) + elif nmb_prototypes > 0: + self.prototypes = paddle.nn.Linear(in_features=output_dim, + out_features=nmb_prototypes, bias_attr=False) + for sublayer in self.sublayers(): + if 
isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) # todo mode='fan_out', + elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): + param_init.constant_init(sublayer.weight, value=1.0) + param_init.constant_init(sublayer.bias, value=0.0) + + if zero_init_residual: + for sublayer in self.sublayers(): + if isinstance(m, Bottleneck): + param_init.constant_init(sublayer.bn3.weight, value=0.0) + elif isinstance(m, BasicBlock): + param_init.constant_init(sublayer.bn2.weight, value=0.0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = paddle.nn.Sequential(conv1x1(self.inplanes, planes * + block.expansion, stride), norm_layer(planes * block.expansion)) + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self + .groups, self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + return paddle.nn.Sequential(*layers) + + def forward_backbone(self, x): + x = self.padding(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + if self.eval_mode: + return x + x = self.avgpool(x) + x = paddle.flatten(x=x, start_axis=1) + return x + + def forward_head(self, x): + if self.projection_head is not None: + x = self.projection_head(x) + if self.l2norm: + x = paddle.nn.functional.normalize(x=x, axis=1, p=2) + if self.prototypes is not None: + return x, self.prototypes(x) + return x + + def forward(self, inputs): + if not isinstance(inputs, list): + inputs = [inputs] + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
+ to_tensor(data=[inp.shape[-1] for inp in inputs]), + return_counts=True)[1], dim=0) + start_idx = 0 + for end_idx in idx_crops: + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx: + end_idx])) + if start_idx == 0: + output = _out + else: + output = paddle.concat(x=(output, _out)) + start_idx = end_idx + return self.forward_head(output) + + +class MultiPrototypes(paddle.nn.Layer): + + def __init__(self, output_dim, nmb_prototypes): + super(MultiPrototypes, self).__init__() + self.nmb_heads = len(nmb_prototypes) + for i, k in enumerate(nmb_prototypes): + self.add_module('prototypes' + str(i), paddle.nn.Linear( + in_features=output_dim, out_features=k, bias_attr=False)) + + def forward(self, x): + out = [] + for i in range(self.nmb_heads): + out.append(getattr(self, 'prototypes' + str(i))(x)) + return out + + +def resnet50(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + + +def resnet50w2(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], widen=2, **kwargs) + + +def resnet50w4(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], widen=4, **kwargs) + + +def resnet50w5(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], widen=5, **kwargs) + diff --git a/passl/models/swav.py b/passl/models/swav.py new file mode 100644 index 00000000..9c64df62 --- /dev/null +++ b/passl/models/swav.py @@ -0,0 +1,135 @@ +import paddle +import paddle.nn as nn + +from passl.models.resnet import resnet50 +from passl.models.base_model import Model + + +__all__ = [ + 'swav_resnet50', + 'swav_resnet50_linearprobe', + # 'swav_resnet50_pretrain', + 'SwAV', + 'SwAVLinearProbe', + # 'SwAVPretrain', +] + +# def model and +class SwAV(Model): + def __init__(self, **kwargs): + super().__init__() + self.res_model = resnet50(**kwargs) + + + def load_pretrained(self, path, rank=0, finetune=False): + pass +# if not os.path.exists(path + '.pdparams'): +# raise ValueError("Model pretrain path {} does not " +# "exists.".format(path)) + +# state_dict = self.state_dict() +# param_state_dict = paddle.load(path + ".pdparams") + +# # for FP16 saving pretrained weight +# for key, value in param_state_dict.items(): +# if key in param_state_dict and key in state_dict and param_state_dict[ +# key].dtype != state_dict[key].dtype: +# param_state_dict[key] = param_state_dict[key].astype( +# state_dict[key].dtype) + +# if not finetune: +# self.set_dict(param_state_dict) +# else: # load model when finetune +# for k in ['head0.weight', 'head0.bias', 'head.weight', 'head.bias']: +# if k in param_state_dict: +# logger.info(f"Removing key {k} from pretrained checkpoint") +# del param_state_dict[k] + +# self.set_dict(param_state_dict) + + def save(self, path, local_rank=0, rank=0): + paddle.save(self.state_dict(), path + ".pdparams") + + +class SwAVLinearProbe(SwAV): + def __init__(self, num_classes=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + super().__init__(**kwargs) + self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) + self.res_model.eval() + self.criterion = nn.CrossEntropyLoss() + + def load_pretrained(self, path): + # only load res_model + model = path + ".pdparams" + if os.path.isfile(path): + state_dict = paddle.load(path) + + # remove prefixe "module." 
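+ # strip any "module." prefix (left by distributed data-parallel wrappers when the checkpoint was saved) so keys match the bare res_model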
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + for k, v in model.state_dict().items(): + if k not in list(state_dict): + logger.info('key "{}" could not be found in provided state dict'.format(k)) + elif state_dict[k].shape != v.shape: + logger.info('key "{}" is of different shape in model and provided state dict'.format(k)) + state_dict[k] = v + msg = self.res_model.set_dict(state_dict, strict=False) + logger.info("Load pretrained model with msg: {}".format(msg)) + else: + logger.info("No pretrained weights found => training with random weights") + + def forward() + with paddle.no_grad(): + output = self.res_model(inp) + output = reglog(output) + + return output + + +def swav_resnet50_linearprobe(**kwargs): + model = SwAVLinearProbe(num_classes=1000, + linear_arch="resnet50", + global_avg=True, + use_bn=False, + output_dim=0, + eval_mode=True, + **kwargs) + return model + + + +class RegLog(paddle.nn.Layer): + """Creates logistic regression on top of frozen features""" + + def __init__(self, num_labels, arch='resnet50', global_avg=False, + use_bn=True): + super(RegLog, self).__init__() + self.bn = None + if global_avg: + if arch == 'resnet50': + s = 2048 + elif arch == 'resnet50w2': + s = 4096 + elif arch == 'resnet50w4': + s = 8192 + self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + else: + assert arch == 'resnet50' + s = 8192 + self.av_pool = paddle.nn.AvgPool2D(6, stride=1) + if use_bn: + self.bn = paddle.nn.BatchNorm2D(num_features=2048, momentum + =1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr= + None, use_global_stats=True) + self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) + x = self.linear.weight.data + paddle.assign(paddle.normal(mean=0.0, std=0.01, shape=x.shape). + astype(x.dtype), x) + self.linear.bias.data.zero_() + + def forward(self, x): + x = self.av_pool(x) + if self.bn is not None: + x = self.bn(x) + + x = x.view((x.shape[0], -1)) + return self.linear(x) \ No newline at end of file diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md new file mode 100644 index 00000000..b3f14b0e --- /dev/null +++ b/tasks/ssl/swav/README.md @@ -0,0 +1,108 @@ +## MoCo v3 for Self-supervised ResNet and ViT + + +PaddlePaddle reimplementation of [facebookresearch's repository for the MoCo v3 model](https://github.com/facebookresearch/moco-v3) that was released with the paper [An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/abs/2104.02057). + +## Requirements +To enjoy some new features, PaddlePaddle 2.4 is required. For more installation tutorials +refer to [installation.md](../../../tutorials/get_started/installation.md) + +## Data Preparation + +Prepare the data into the following directory: +```text +dataset/ +└── ILSVRC2012 + ├── train + └── val +``` + + +## How to Self-supervised Pre-Training + +With a batch size of 4096, ViT-Base is trained with 4 nodes: + +```bash +# Note: Set the following environment variables +# and then need to run the script on each node. 
+unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=4 +export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml +``` + +## How to Linear Classification + +By default, we use momentum-SGD and a batch size of 1024 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. + +```bash +unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml +``` + +## How to End-to-End Fine-tuning +To perform end-to-end fine-tuning for ViT, use our script to convert the pre-trained ViT checkpoint to PASSL DeiT format: + +```bash +python extract_weight.py \ + --input pretrained/checkpoint_0299.pd \ + --output pretrained/moco_vit_base.pdparams +``` + +Then run the training with the converted PASSL format checkpoint: + +```bash +unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml +``` + +## Other Configurations +We provide more directly runnable configurations, see [MoCoV3 Configurations](./configs/). 
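+
+As a quick sanity check of the SwAV-style ResNet-50 introduced in `passl/models/resnet.py`, the following minimal sketch runs a forward pass on random data. The 128-d embedding, 2048-d hidden MLP and 3000 prototypes are assumed SwAV defaults rather than values taken from the configs above:
+
+```python
+import paddle
+from passl.models.resnet import resnet50
+
+# SwAV-style backbone with projection head and prototype layer (assumed sizes)
+model = resnet50(normalize=True, output_dim=128, hidden_mlp=2048, nmb_prototypes=3000)
+model.eval()
+
+x = paddle.randn([2, 3, 224, 224])
+with paddle.no_grad():
+    feats = model.forward_backbone(x)        # pooled features, shape [2, 2048]
+    emb, scores = model.forward_head(feats)  # l2-normalized embeddings [2, 128], prototype scores [2, 3000]
+print(emb.shape, scores.shape)
+```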
+ +## Models + +### ViT-Base +| Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc | Checkpoint | +| ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ | +| moco_vit_base | pretrain | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 300 | - | [download](https://plsc.bj.bcebos.com/models/mocov3/v2.4/moco_vit_base_in1k_300ep.pd) | +| moco_vit_base | linear prob | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 90 | 0.7662 | | +| moco_vit_base | finetune | ImageNet2012 | [config](./configs/DeiT_base_patch16_224_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 150 | 0.8288 | | + +## Citations + +```bibtex +@Article{chen2021mocov3, + author = {Xinlei Chen* and Saining Xie* and Kaiming He}, + title = {An Empirical Study of Training Self-Supervised Vision Transformers}, + journal = {arXiv preprint arXiv:2104.02057}, + year = {2021}, +} +``` diff --git a/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..d70c6647 --- /dev/null +++ b/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained + finetune: True + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + seed: 2022 + +# FP16 setting +FP16: + level: O1 + GradScaler: + init_loss_scaling: 65536.0 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: DeiT_base_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: TimmCosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + decay_unit: epoch + +Optimizer: + name: AdamW + betas: (0.9, 0.999) + eps: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: ["cls_token", "pos_embed", "norm", "bias"] + use_master_param: True + exp_avg_force_fp32: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + interpolation: bicubic + - RandomHorizontalFlip: + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + mean: [0.485, 0.456, 0.406] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + - ToCHWImage: + batch_transform: + - TransformOpSampler: + Mixup: + alpha: 0.8 + prob: 0.5 + epsilon: 0.1 + class_num: 1000 + Cutmix: + alpha: 1.0 + prob: 0.5 + epsilon: 0.1 + class_num: 1000 + sampler: + name: RepeatedAugSampler + batch_size: 128 # accum_steps: 1, total batchsize: 1024 + drop_last: 
False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/val + transform: + - Resize: + size: 256 + interpolation: bicubic + backend: pil + - CenterCrop: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..ae0efc7b --- /dev/null +++ b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,109 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + seed: 2022 + +# FP16 setting +FP16: + level: O1 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: mocov3_vit_base_linearprobe + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: TimmCosine + learning_rate: 12.0 + decay_unit: epoch + last_epoch: 0 + warmup_epoch: 0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 0.0 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: data/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 # accum_steps: 1, total batchsize: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: data/ILSVRC2012/val + transform: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml new file mode 100644 index 00000000..cb3a7a9e --- /dev/null +++ b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -0,0 +1,108 @@ +# global configs +Global: + task_type: ContrastiveLearning + train_loop: ContrastiveLearningTrainingEpochLoop + validate_loop: None + checkpoint: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + 
eval_during_train: False + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + seed: 2023 + +# FP16 setting +FP16: + level: O1 + GradScaler: + init_loss_scaling: 65536.0 + incr_every_n_steps: 2000 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: mocov3_vit_base_pretrain + +LRScheduler: + name: TimmCosine + learning_rate: 0.0024 + eta_min: 0.0 + warmup_epoch: 40 + warmup_start_lr: 0.0 + decay_unit: step + warmup_prefix: True + +Optimizer: + name: AdamW + betas: (0.9, 0.999) + eps: 1e-8 + weight_decay: 0.1 + use_master_param: True + exp_avg_force_fp32: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/train + transform: + - TwoViewsTransform: + base_transform1: + - RandomResizedCrop: + size: 224 + scale: [0.08, 1.0] + interpolation: bicubic + - ColorJitter: + p: 0.8 + brightness: 0.4 + contrast: 0.4 + saturation: 0.2 + hue: 0.1 + - RandomGrayscale: + p: 0.2 + - SimCLRGaussianBlur: + p: 1.0 + sigma: [.1, 2.] + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + base_transform2: + - RandomResizedCrop: + size: 224 + scale: [0.08, 1.0] + interpolation: bicubic + - ColorJitter: + p: 0.8 + brightness: 0.4 + contrast: 0.4 + saturation: 0.2 + hue: 0.1 + - RandomGrayscale: + p: 0.2 + - BYOLSolarize: + p: 0.2 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 # accum_steps: 1, total batchsize: 4096 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml new file mode 100644 index 00000000..3f8782ca --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -0,0 +1,110 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: swav_800ep_pretrain.pdparams + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + seed: 31 + +# FP16 setting ignore in align +# FP16: +# level: O1 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_linearprobe + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: TimmCosine + learning_rate: 0.3 + decay_unit: epoch + last_epoch: 0 + warmup_epoch: 0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 # accum_steps: 1, total batchsize: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/val 
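+ # standard ImageNet evaluation preprocessing: resize to 256, center-crop to 224, then normalize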
+ transform: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh new file mode 100644 index 00000000..cae7ebba --- /dev/null +++ b/tasks/ssl/swav/finetune.sh @@ -0,0 +1,28 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: Set the following environment variables +# and then need to run the script on each node. +unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh new file mode 100644 index 00000000..a0a26b4e --- /dev/null +++ b/tasks/ssl/swav/linearprobe.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#unset PADDLE_TRAINER_ENDPOINTS +#export PADDLE_NNODES=1 +#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh new file mode 100644 index 00000000..f5dfc176 --- /dev/null +++ b/tasks/ssl/swav/pretrain.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#unset PADDLE_TRAINER_ENDPOINTS +#export PADDLE_NNODES=4 +#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml From 31921ca0be98649ee28a6a124d4ff8344621d8f6 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 18 Apr 2023 20:58:16 +0800 Subject: [PATCH 02/46] valid_train --- passl/models/__init__.py | 1 + passl/models/resnet.py | 9 ++- passl/models/swav.py | 69 +++++++++++-------- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 12 ++-- tasks/ssl/swav/linearprobe.sh | 18 +++-- tools/train.py | 0 6 files changed, 67 insertions(+), 42 deletions(-) mode change 100644 => 100755 tools/train.py diff --git a/passl/models/__init__.py b/passl/models/__init__.py index ad01e964..6174f44e 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -25,6 +25,7 @@ from .cae import * from .convnext import * from .mocov3 import * +from .swav import * __all__ = ["build_model"] diff --git a/passl/models/resnet.py b/passl/models/resnet.py index c76709cc..9abfc0d4 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -1,4 +1,6 @@ import paddle +import paddle.nn as nn + from passl.models.base_model import Model def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): @@ -95,6 +97,7 @@ def constant_init(param, **kwargs): initializer = nn.initializer.Constant(**kwargs) initializer(param, param.block) + class ResNet(paddle.nn.Layer): def __init__(self, block, layers, zero_init_residual=False, groups=1, widen=1, width_per_group=64, replace_stride_with_dilation=None, @@ -159,8 +162,8 @@ def __init__(self, block, layers, zero_init_residual=False, groups=1, if isinstance(sublayer, nn.Conv2D): kaiming_normal_init(sublayer.weight) # todo mode='fan_out', elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): - param_init.constant_init(sublayer.weight, value=1.0) - param_init.constant_init(sublayer.bias, value=0.0) + constant_init(sublayer.weight, value=1.0) + constant_init(sublayer.bias, value=0.0) if zero_init_residual: for sublayer in self.sublayers(): @@ -219,7 +222,7 @@ def forward(self, inputs): inputs = [inputs] idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
to_tensor(data=[inp.shape[-1] for inp in inputs]), - return_counts=True)[1], dim=0) + return_counts=True)[1], axis=0) # padiff start_idx = 0 for end_idx in idx_crops: _out = self.forward_backbone(paddle.concat(x=inputs[start_idx: diff --git a/passl/models/swav.py b/passl/models/swav.py index 9c64df62..7cba9acc 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,3 +1,5 @@ +import os + import paddle import paddle.nn as nn @@ -6,7 +8,7 @@ __all__ = [ - 'swav_resnet50', + # 'swav_resnet50', 'swav_resnet50_linearprobe', # 'swav_resnet50_pretrain', 'SwAV', @@ -52,42 +54,48 @@ def save(self, path, local_rank=0, rank=0): class SwAVLinearProbe(SwAV): - def __init__(self, num_classes=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): super().__init__(**kwargs) self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) self.res_model.eval() - self.criterion = nn.CrossEntropyLoss() - def load_pretrained(self, path): + def load_pretrained(self, path, rank=0, finetune=False): # only load res_model - model = path + ".pdparams" if os.path.isfile(path): - state_dict = paddle.load(path) - - # remove prefixe "module." - state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} - for k, v in model.state_dict().items(): - if k not in list(state_dict): - logger.info('key "{}" could not be found in provided state dict'.format(k)) - elif state_dict[k].shape != v.shape: - logger.info('key "{}" is of different shape in model and provided state dict'.format(k)) - state_dict[k] = v - msg = self.res_model.set_dict(state_dict, strict=False) - logger.info("Load pretrained model with msg: {}".format(msg)) + para_state_dict = paddle.load(path) + + # resnet + model_state_dict = self.res_model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in para_state_dict: + print("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list(model_state_dict[k] + .shape): + print( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, model_state_dict[k] + .shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + self.res_model.set_dict(model_state_dict) + print("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), "backbone")) else: - logger.info("No pretrained weights found => training with random weights") + print("No pretrained weights found => training with random weights") - def forward() + def forward(self, inp): with paddle.no_grad(): output = self.res_model(inp) - output = reglog(output) + output = self.linear(output) return output def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(num_classes=1000, - linear_arch="resnet50", + model = SwAVLinearProbe(linear_arch="resnet50", global_avg=True, use_bn=False, output_dim=0, @@ -96,7 +104,14 @@ def swav_resnet50_linearprobe(**kwargs): return model +def normal_init(param, **kwargs): + initializer = nn.initializer.Normal(**kwargs) + initializer(param, param.block) +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" @@ -120,16 +135,16 @@ def __init__(self, num_labels, arch='resnet50', global_avg=False, self.bn = 
paddle.nn.BatchNorm2D(num_features=2048, momentum =1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr= None, use_global_stats=True) + self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) - x = self.linear.weight.data - paddle.assign(paddle.normal(mean=0.0, std=0.01, shape=x.shape). - astype(x.dtype), x) - self.linear.bias.data.zero_() + normal_init(self.linear.weight, mean=0.0, std=0.01) + constant_init(self.linear.bias, value=0.0) # padiff + def forward(self, x): x = self.av_pool(x) if self.bn is not None: x = self.bn(x) - x = x.view((x.shape[0], -1)) + x = x.reshape((x.shape[0], -1)) return self.linear(x) \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 3f8782ca..569f2e86 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,8 +4,8 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: swav_800ep_pretrain.pdparams - output_dir: ./output/ + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + output_dir: ./output/baseline device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -14,7 +14,7 @@ Global: eval_unit: "epoch" accum_steps: 1 epochs: 100 - print_batch_step: 10 + print_batch_step: 100 use_visualdl: False seed: 31 @@ -41,7 +41,7 @@ Loss: LRScheduler: name: TimmCosine - learning_rate: 0.3 + learning_rate: 0.6 decay_unit: epoch last_epoch: 0 warmup_epoch: 0 @@ -57,7 +57,7 @@ DataLoader: Train: dataset: name: ImageFolder - root: ./dataset/ILSVRC2012/train + root: data/ILSVRC2012/train transform: - RandomResizedCrop: size: 224 @@ -78,7 +78,7 @@ DataLoader: Eval: dataset: name: ImageFolder - root: ./dataset/ILSVRC2012/val + root: data/ILSVRC2012/val transform: - Resize: size: 256 diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index a0a26b4e..31511a45 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -13,14 +13,20 @@ # limitations under the License. 
#unset PADDLE_TRAINER_ENDPOINTS -#export PADDLE_NNODES=1 -#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" -#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export FLAGS_stop_check_timeout=3600 +# export PADDLE_NNODES=1 +# #export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +# export FLAGS_stop_check_timeout=3600 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml + +# python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c \ No newline at end of file diff --git a/tools/train.py b/tools/train.py old mode 100644 new mode 100755 From cc7f630b56899752cecb9ccdfbc49d09d6a147e1 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 20 Apr 2023 17:52:34 +0800 Subject: [PATCH 03/46] freeze_align --- passl/core/param_fuse.py | 31 ++-- passl/engine/loops/classification_loop.py | 108 +++++++++++- passl/models/resnet.py | 4 +- passl/models/swav.py | 62 +++++-- passl/optimizer/__init__.py | 156 +++++++++++++++--- passl/optimizer/momentum.py | 2 +- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 7 +- 7 files changed, 305 insertions(+), 65 deletions(-) diff --git a/passl/core/param_fuse.py b/passl/core/param_fuse.py index e98cca62..87fc5cb3 100644 --- a/passl/core/param_fuse.py +++ b/passl/core/param_fuse.py @@ -459,18 +459,6 @@ def flatten_dense_tensors(parameters): param_storage.add_rank_params(parameters, _param2align) - # process gradient - # grad_storage = None - grad_storage = GradStorage( - size=_buffer_size, - dtype=dtype, - device="gpu", - destination="0", - parm2align=_param2align) - - for param in parameters: - grad_storage.add_grad(param, _param2align[param.name]) - if in_dygraph_mode(): fused_param = EagerParamBase( shape=param_storage.buffer.shape, @@ -482,7 +470,22 @@ def flatten_dense_tensors(parameters): dtype=dtype, name=unique_name.generate('fused_param')) param_storage.buffer._share_buffer_to(fused_param) - fused_param._copy_gradient_from(grad_storage.buffer) + + if not stop_gradient: + # process gradient + # grad_storage = None + grad_storage = GradStorage( + size=_buffer_size, + dtype=dtype, + device="gpu", + destination="0", + parm2align=_param2align) + + for param in parameters: + grad_storage.add_grad(param, _param2align[param.name]) + + fused_param._copy_gradient_from(grad_storage.buffer) + fused_param.__dict__.update(state) fused_param.stop_gradient = stop_gradient @@ -501,4 +504,4 @@ def get_fused_params(params): for group_idx, parameters in var_groups.items(): fused_param = flatten_dense_tensors(parameters) fused_params.append(fused_param) - return fused_params + return fused_params \ No newline at end of file diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index edf9be7c..cf61ad78 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -31,6 +31,91 @@ from passl.utils import logger from .loop import _Loop, TrainingEpochLoop + +import os +import logging +import time +from datetime import timedelta +import pandas as pd + + +class LogFormatter: + def __init__(self): + self.start_time = 
time.time() + + def format(self, record): + elapsed_seconds = round(record.created - self.start_time) + + prefix = "%s - %s - %s" % ( + record.levelname, + time.strftime("%x %X"), + timedelta(seconds=elapsed_seconds), + ) + message = record.getMessage() + message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) + return "%s - %s" % (prefix, message) if message else "" + + +def create_logger(filepath, rank): + """ + Create a logger. + Use a different log file for each process. + """ + # create log formatter + log_formatter = LogFormatter() + + # create file handler and set level to debug + if filepath is not None: + if rank > 0: + filepath = "%s-%i" % (filepath, rank) + file_handler = logging.FileHandler(filepath, "a") + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(log_formatter) + + # create console handler and set level to info + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(log_formatter) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + if filepath is not None: + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + # reset logger elapsed time + def reset_time(): + log_formatter.start_time = time.time() + + logger.reset_time = reset_time + + return logger + + +def init_logger(name): + logger = create_logger( + os.path.join("{}.log".format(name)), rank=0 + ) + logger.info("============ Initialized logger ============") + logger.info("") + return logger + + +def log_model(model, logger): + model1 = model.res_model + for name, param in model1.named_parameters(): + logger.info(name) + logger.info(param.abs().sum()) + + model2 = model.linear + for name, param in model2.named_parameters(): + logger.info(name) + logger.info(param.abs().sum()) + + class ClassificationTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): @@ -60,8 +145,13 @@ def forward_backward(self, batch): out = self.trainer.model(data) final_out.append(out) - + + # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') + loss_dict = self.trainer.train_loss_func(out, label) + + # logger1 = init_logger('first') + # log_model(self.trainer.model, logger1) for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -72,9 +162,23 @@ def forward_backward(self, batch): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() + +# grad_sync(self.trainer.optimizer.param_groups) + +# # do unscale and step if using fp16 and not found nan/inf +# # otherwise do nothing +# self.trainer.scaler.step(self.trainer.optimizer) +# # do update loss scaling if using fp16 +# # otherwise do nothing +# self.trainer.scaler.update() + + # logger2 = init_logger('second') + # log_model(self.trainer.model, logger2) + # import pdb; pdb.set_trace() + out = paddle.concat(final_out, axis=0) - return out, final_loss_dict + return out, final_loss_dict, def train_one_step(self, batch): diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 9abfc0d4..07e9362f 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -1,3 +1,5 @@ +import functools + import paddle import paddle.nn as nn @@ -106,7 +108,7 @@ def __init__(self, block, layers, 
zero_init_residual=False, groups=1, super(ResNet, self).__init__() if norm_layer is None: - norm_layer = paddle.nn.BatchNorm2D + norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=True) self._norm_layer = norm_layer self.eval_mode = eval_mode self.padding = paddle.nn.Pad2D(padding=1, value=0.0) diff --git a/passl/models/swav.py b/passl/models/swav.py index 7cba9acc..8f20b6d9 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -3,6 +3,7 @@ import paddle import paddle.nn as nn +from passl.nn import init from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -51,21 +52,36 @@ def load_pretrained(self, path, rank=0, finetune=False): def save(self, path, local_rank=0, rank=0): paddle.save(self.state_dict(), path + ".pdparams") - + class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): super().__init__(**kwargs) self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) self.res_model.eval() - - def load_pretrained(self, path, rank=0, finetune=False): - # only load res_model + + # freeze all layers but the last fc + for name, param in self.named_parameters(): + if name not in ['linear.linear.weight', 'linear.linear.bias']: + param.stop_gradient = True + + # optimize only the linear classifier + parameters = list( + filter(lambda p: not p.stop_gradient, self.parameters())) + assert len(parameters) == 2 # weight, bias + + self.apply(self._freeze_norm) + + def _freeze_norm(self, layer): + if isinstance(layer, (nn.layer.norm._BatchNormBase)): + layer._use_global_stats = True + + def _load_model(self, path, model, tag): if os.path.isfile(path): para_state_dict = paddle.load(path) # resnet - model_state_dict = self.res_model.state_dict() + model_state_dict = model.state_dict() keys = model_state_dict.keys() num_params_loaded = 0 for k in keys: @@ -80,13 +96,25 @@ def load_pretrained(self, path, rank=0, finetune=False): else: model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 - self.res_model.set_dict(model_state_dict) + model.set_dict(model_state_dict) print("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), "backbone")) + num_params_loaded, len(model_state_dict), tag)) else: - print("No pretrained weights found => training with random weights") + print("No pretrained weights found in {} => training with random weights".format(tag)) + + def load_pretrained(self, path, rank=0, finetune=False): + self._load_model(path, self.res_model, 'backbone') + self._load_model("linear.pdparams", self.linear, 'linear') + def forward(self, inp): +# import numpy as np + # import pdb; pdb.set_trace() + +# np.random.seed(42) +# a = np.random.rand(32, 3, 224, 224) +# inp = paddle.to_tensor(a).astype('float32') + with paddle.no_grad(): output = self.res_model(inp) output = self.linear(output) @@ -104,13 +132,14 @@ def swav_resnet50_linearprobe(**kwargs): return model -def normal_init(param, **kwargs): - initializer = nn.initializer.Normal(**kwargs) - initializer(param, param.block) +# def normal_init(param, **kwargs): +# initializer = nn.initializer.Normal(**kwargs) +# initializer(param, param.block) -def constant_init(param, **kwargs): - initializer = nn.initializer.Constant(**kwargs) - initializer(param, param.block) +# def constant_init(param, **kwargs): +# initializer = nn.initializer.Constant(**kwargs) +# initializer(param, param.block) + class RegLog(paddle.nn.Layer): """Creates logistic regression on top of 
frozen features""" @@ -137,9 +166,8 @@ def __init__(self, num_labels, arch='resnet50', global_avg=False, None, use_global_stats=True) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) - normal_init(self.linear.weight, mean=0.0, std=0.01) - constant_init(self.linear.bias, value=0.0) # padiff - + init.normal_(self.linear.weight, mean=0.0, std=0.01) + init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 2d87f3f3..451da1ff 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -1,3 +1,88 @@ +# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +# from __future__ import absolute_import +# from __future__ import division +# from __future__ import print_function + +# from collections import defaultdict + +# import copy +# import paddle + +# from passl.core.grad_clip import ClipGradByGlobalNorm +# from passl.core.param_fuse import get_fused_params + +# from passl.utils import logger + +# from .optimizer import Optimizer +# from .adamw import AdamW +# from .adafactor import Adafactor +# from .momentum import Momentum +# from .momentum_lars import MomentumLARS + + +# def build_optimizer(config, lr_scheduler, model=None): +# config = copy.deepcopy(config) + +# grad_clip = None +# grad_clip_config = config.pop('grad_clip', None) +# if grad_clip_config is not None: +# grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') +# grad_clip = eval(grad_clip_name)(**grad_clip_config) + +# no_weight_decay_name = config.pop('no_weight_decay_name', []) + +# param_group = defaultdict(list) +# for n, p in model.named_parameters(): +# state = copy.deepcopy(p.__dict__) +# if any(nd in n for nd in no_weight_decay_name): +# state['no_weight_decay'] = True +# param_group[str(state)].append(p) + +# # fuse params +# for key in param_group: +# if 'gpu' not in paddle.get_device(): +# continue +# if "'is_distributed': True" in key: +# continue +# if "'has_sparse_grad': True" in key: +# continue + +# param_group[key] = get_fused_params(param_group[key]) + +# # bulid optimizer params +# params = [] +# for key in param_group: +# group = {'params': param_group[key]} + +# if "'is_distributed': True" in key: +# group['is_distributed'] = True + +# if 'no_weight_decay' in key: +# group['weight_decay'] = 0.0 + +# params.append(group) + +# optim_name = config.pop('name') +# optim = eval(optim_name)(params, +# lr=lr_scheduler, +# grad_clip=grad_clip, +# **config) +# logger.debug("build optimizer ({}) success..".format(optim)) +# return optim + + # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -34,7 +119,8 @@ def build_optimizer(config, lr_scheduler, model=None): config = copy.deepcopy(config) - + optim_name = config.pop('name') + grad_clip = None grad_clip_config = config.pop('grad_clip', None) if grad_clip_config is not None: @@ -42,40 +128,56 @@ def build_optimizer(config, lr_scheduler, model=None): grad_clip = eval(grad_clip_name)(**grad_clip_config) no_weight_decay_name = config.pop('no_weight_decay_name', []) + tensor_fusion = config.pop('tensor_fusion', True) + if 'LAR' in optim_name: + tensor_fusion = False + logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') - param_group = defaultdict(list) - for n, p in model.named_parameters(): - state = copy.deepcopy(p.__dict__) - if any(nd in n for nd in no_weight_decay_name): - state['no_weight_decay'] = True - param_group[str(state)].append(p) - # fuse params - for key in param_group: - if 'gpu' not in paddle.get_device(): - continue - if "'is_distributed': True" in key: - continue - if "'has_sparse_grad': True" in key: - continue + if hasattr(model, 'param_groups'): + param_group = model.param_groups(no_weight_decay_name, tensor_fusion) + for group in param_group: + if 'tensor_fusion' in group and group['tensor_fusion']: + group['params'] = get_fused_params(group['params']) + else: + param_group_map = defaultdict(list) + for n, p in model.named_parameters(): + state = copy.deepcopy(p.__dict__) + state['stop_gradient'] = p.stop_gradient + if any(nd in n for nd in no_weight_decay_name): + state['no_weight_decay'] = True + param_group_map[str(state)].append(p) - param_group[key] = get_fused_params(param_group[key]) - # bulid optimizer params - params = [] - for key in param_group: - group = {'params': param_group[key]} + if tensor_fusion: + # fuse params + for key in param_group_map: + if 'gpu' not in paddle.get_device(): + continue + if "'is_distributed': True" in key: + continue + if "'has_sparse_grad': True" in key: + continue + param_group_map[key] = get_fused_params(param_group_map[key]) - if "'is_distributed': True" in key: - group['is_distributed'] = True - if 'no_weight_decay' in key: - group['weight_decay'] = 0.0 + # bulid optimizer params + param_group = [] + for key in param_group_map: + group = {'params': param_group_map[key]} - params.append(group) - optim_name = config.pop('name') - optim = eval(optim_name)(params, + if "'is_distributed': True" in key: + group['is_distributed'] = True + + + if 'no_weight_decay' in key: + group['weight_decay'] = 0.0 + + + param_group.append(group) + + optim = eval(optim_name)(param_group, lr=lr_scheduler, grad_clip=grad_clip, **config) diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 55402fd4..8b569c7c 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -72,7 +72,7 @@ def step(self): grad = p.grad if grad is None: continue - + # print('###########',p.name) if grad.is_selected_rows(): raise RuntimeError( 'Momentum does not support sparse gradients.') diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 569f2e86..a290ea19 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: 
/root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline + output_dir: ./output/baseline_0420_align_trackTrue device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -41,7 +41,8 @@ Loss: LRScheduler: name: TimmCosine - learning_rate: 0.6 + learning_rate: 0.3 + eta_min: 0.0 decay_unit: epoch last_epoch: 0 warmup_epoch: 0 @@ -50,8 +51,8 @@ Optimizer: name: Momentum momentum: 0.9 weight_decay: 1e-6 + tensor_fusion: True - # data loader for train and eval DataLoader: Train: From 4d8dc6b29b6ec2c12d92f91b2fe4ed349aa6e38c Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 21 Apr 2023 11:01:32 +0800 Subject: [PATCH 04/46] add_ft_swav --- passl/data/dataset/imagefolder_dataset.py | 20 ++- passl/models/swav.py | 107 ++++++------- passl/scheduler/lr_scheduler.py | 4 +- ...se_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml | 148 ------------------ ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 112 +++++++++++++ ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 2 + 6 files changed, 180 insertions(+), 213 deletions(-) delete mode 100644 tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 618f5d77..ef03f1c4 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union -import numpy as np import os +import urllib +import numpy as np +from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union import paddle @@ -56,11 +57,22 @@ def __init__(self, transform=None, target_transform=None, loader=default_loader, - extensions=IMG_EXTENSIONS): + extensions=IMG_EXTENSIONS, + samples_tag=None): self.root = root classes, class_to_idx = self.find_classes(self.root) - samples = self.make_dataset(self.root, class_to_idx, extensions) + if samples_tag is None: + samples = self.make_dataset(self.root, class_to_idx, extensions) + elif samples_tag == "semi_1" or samples == "semi_10": + train_data_path = os.path.join(root, "train") + percent = samples_tag.split('_')[-1] + subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] + samples = [(os.path.join(train_data_path, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] + else: + raise NotImplementedError('{} is not implemented'.format(samples)) + print(f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions diff --git a/passl/models/swav.py b/passl/models/swav.py index 8f20b6d9..e6fb79fb 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -14,6 +14,7 @@ # 'swav_resnet50_pretrain', 'SwAV', 'SwAVLinearProbe', + 'SwAVFinetune', # 'SwAVPretrain', ] @@ -23,7 +24,32 @@ def __init__(self, **kwargs): super().__init__() self.res_model = resnet50(**kwargs) - + def _load_model(self, path, model, tag): + if os.path.isfile(path): + para_state_dict = paddle.load(path) + + # resnet + model_state_dict = model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in 
para_state_dict: + print("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list(model_state_dict[k] + .shape): + print( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, model_state_dict[k] + .shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + model.set_dict(model_state_dict) + print("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), tag)) + else: + print("No pretrained weights found in {} => training with random weights".format(tag)) + def load_pretrained(self, path, rank=0, finetune=False): pass # if not os.path.exists(path + '.pdparams'): @@ -55,9 +81,9 @@ def save(self, path, local_rank=0, rank=0): class SwAVLinearProbe(SwAV): - def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) - self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) + self.linear = RegLog(class_num) self.res_model.eval() # freeze all layers but the last fc @@ -75,38 +101,11 @@ def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_ def _freeze_norm(self, layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True - - def _load_model(self, path, model, tag): - if os.path.isfile(path): - para_state_dict = paddle.load(path) - - # resnet - model_state_dict = model.state_dict() - keys = model_state_dict.keys() - num_params_loaded = 0 - for k in keys: - if k not in para_state_dict: - print("{} is not in pretrained model".format(k)) - elif list(para_state_dict[k].shape) != list(model_state_dict[k] - .shape): - print( - "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" - .format(k, para_state_dict[k].shape, model_state_dict[k] - .shape)) - else: - model_state_dict[k] = para_state_dict[k] - num_params_loaded += 1 - model.set_dict(model_state_dict) - print("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), tag)) - else: - print("No pretrained weights found in {} => training with random weights".format(tag)) def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') self._load_model("linear.pdparams", self.linear, 'linear') - def forward(self, inp): # import numpy as np # import pdb; pdb.set_trace() @@ -121,14 +120,23 @@ def forward(self, inp): return output +class SwAVFinetune(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def load_pretrained(self, path, rank=0, finetune=False): + self._load_model(path, self.res_model, 'backbone') + + def forward(self, inp): + return self.res_model(inp) + def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(linear_arch="resnet50", - global_avg=True, - use_bn=False, - output_dim=0, - eval_mode=True, - **kwargs) + model = SwAVLinearProbe(**kwargs) + return model + +def swav_resnet50_finetune(**kwargs): + model = SwAVFinetune(**kwargs) return model @@ -144,35 +152,16 @@ def swav_resnet50_linearprobe(**kwargs): class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" - def __init__(self, num_labels, arch='resnet50', global_avg=False, - use_bn=True): + def __init__(self, num_labels): super(RegLog, self).__init__() - self.bn = None - if global_avg: - if arch == 'resnet50': - s = 2048 - elif arch == 'resnet50w2': - s = 4096 - 
elif arch == 'resnet50w4': - s = 8192 - self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - else: - assert arch == 'resnet50' - s = 8192 - self.av_pool = paddle.nn.AvgPool2D(6, stride=1) - if use_bn: - self.bn = paddle.nn.BatchNorm2D(num_features=2048, momentum - =1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr= - None, use_global_stats=True) - + s = 2048 + self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) + init.normal_(self.linear.weight, mean=0.0, std=0.01) init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) - if self.bn is not None: - x = self.bn(x) - x = x.reshape((x.shape[0], -1)) return self.linear(x) \ No newline at end of file diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 223ca349..6a492920 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -123,8 +123,8 @@ class Step(lr.LRScheduler): def __init__(self, step_each_epoch, epochs, - boundaries, - values, + boundaries, # [12, 16] + values, #[0.01, 0.002, 0.0004], warmup_steps=0, warmup_epochs=0, decay_unit='epoch', diff --git a/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml deleted file mode 100644 index d70c6647..00000000 --- a/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# global configs -Global: - task_type: Classification - train_loop: ClassificationTrainingEpochLoop - validate_loop: ClassificationEvaluationLoop - checkpoint: null - pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained - finetune: True - output_dir: ./output/ - device: gpu - save_interval: 1 - max_num_latest_checkpoint: 0 - eval_during_train: True - eval_interval: 1 - eval_unit: "epoch" - accum_steps: 1 - epochs: 150 - print_batch_step: 10 - use_visualdl: False - seed: 2022 - -# FP16 setting -FP16: - level: O1 - GradScaler: - init_loss_scaling: 65536.0 - -DistributedStrategy: - data_parallel: True - -# model architecture -Model: - name: DeiT_base_patch16_224 - drop_path_rate : 0.1 - drop_rate : 0.0 - class_num: 1000 - -# loss function config for traing/eval process -Loss: - Train: - - CELoss: - weight: 1.0 - Eval: - - CELoss: - weight: 1.0 - -LRScheduler: - name: TimmCosine - learning_rate: 1e-3 - eta_min: 1e-5 - warmup_epoch: 5 - warmup_start_lr: 1e-6 - decay_unit: epoch - -Optimizer: - name: AdamW - betas: (0.9, 0.999) - eps: 1e-8 - weight_decay: 0.05 - no_weight_decay_name: ["cls_token", "pos_embed", "norm", "bias"] - use_master_param: True - exp_avg_force_fp32: True - -# data loader for train and eval -DataLoader: - Train: - dataset: - name: ImageFolder - root: ./dataset/ILSVRC2012/train - transform: - - RandomResizedCrop: - size: 224 - interpolation: bicubic - - RandomHorizontalFlip: - - TimmAutoAugment: - config_str: rand-m9-mstd0.5-inc1 - interpolation: bicubic - img_size: 224 - mean: [0.485, 0.456, 0.406] - - NormalizeImage: - scale: 1.0/255.0 - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: '' - - RandomErasing: - EPSILON: 0.25 - sl: 0.02 - sh: 1.0/3.0 - r1: 0.3 - attempt: 10 - use_log_aspect: True - mode: pixel - - ToCHWImage: - batch_transform: - - TransformOpSampler: - Mixup: - alpha: 0.8 - prob: 0.5 - epsilon: 0.1 - class_num: 1000 - Cutmix: - alpha: 1.0 - prob: 0.5 - epsilon: 0.1 - class_num: 1000 - sampler: - name: RepeatedAugSampler - 
batch_size: 128 # accum_steps: 1, total batchsize: 1024 - drop_last: False - shuffle: True - loader: - num_workers: 8 - use_shared_memory: True - - Eval: - dataset: - name: ImageFolder - root: ./dataset/ILSVRC2012/val - transform: - - Resize: - size: 256 - interpolation: bicubic - backend: pil - - CenterCrop: - size: 224 - - NormalizeImage: - scale: 1.0/255.0 - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: '' - - ToCHWImage: - sampler: - name: DistributedBatchSampler - batch_size: 256 - drop_last: False - shuffle: False - loader: - num_workers: 8 - use_shared_memory: True - -Metric: - Eval: - - TopkAcc: - topk: [1, 5] - -Export: - export_type: paddle - input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..f2a229a0 --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,112 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + finetune: True + output_dir: ./output/semi_0420 + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 20 + print_batch_step: 100 + use_visualdl: False + seed: 31 + +# FP16 setting +# FP16: +# level: O1 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_finetune + output_dim: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: Step + learning_rate: 0.01 + boundaries: [12, 16] + values: [0.01, 0.002, 0.0004] + decay_unit: epoch + last_epoch: 0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 0.0 + tensor_fusion: False + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: data/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 # accum_steps: 1, total batchsize: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: data/ILSVRC2012/val + transform: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index a290ea19..251f5f49 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -28,6 +28,8 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_linearprobe + output_dim: 0 + 
eval_mode: True class_num: 1000 # loss function config for traing/eval process From 32b94c36f115c3c94124246936dbef151e705ba4 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 21 Apr 2023 17:07:26 +0800 Subject: [PATCH 05/46] add_pretrain --- passl/data/dataset/multicrop_dataset.py | 94 +++++++++++++++ passl/data/preprocess/basic_transforms.py | 19 +++ passl/models/swav.py | 18 ++- passl/scheduler/lr_scheduler.py | 2 +- ...se_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml | 109 ------------------ ...e_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml | 108 ----------------- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 71 ++++++++++++ tasks/ssl/swav/finetune.sh | 5 +- tasks/ssl/swav/linearprobe.sh | 4 - tasks/ssl/swav/pretrain.sh | 13 +-- 10 files changed, 207 insertions(+), 236 deletions(-) create mode 100644 passl/data/dataset/multicrop_dataset.py delete mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml delete mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py new file mode 100644 index 00000000..926d4a59 --- /dev/null +++ b/passl/data/dataset/multicrop_dataset.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.vision.transforms import ( + Compose, + Transpose, + ColorJitter, + RandomResizedCrop, + RandomHorizontalFlip, +) +from passl.data.dataset.imagefolder_dataset import ImageFolder +from passl.data.preprocess import ( + RandomApply, + GaussianBlur, + NormalizeImage, + RandomGrayscale, +) + + +class MultiCropDataset(ImageFolder): + def __init__(self, + dataroot, + size_crops, + num_crops, + min_scale_crops, + max_scale_crops, + return_label=False): + super(MultiCropDataset, self).__init__(dataroot) + + assert len(size_crops) == len(num_crops) + assert len(min_scale_crops) == len(num_crops) + assert len(max_scale_crops) == len(num_crops) + self.return_label = return_label + + color_transform = [get_color_distortion(), get_pil_gaussian_blur()] + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + trans = [] + for i in range(len(size_crops)): + randomresizedcrop = RandomResizedCrop( + size_crops[i], + scale=(min_scale_crops[i], max_scale_crops[i]), + ) + trans.extend([Compose([ + randomresizedcrop, + RandomHorizontalFlip(prob=0.5), + Compose(color_transform), + Transpose(), + NormalizeImage(scale='1.0/255.0', mean=mean, std=std)]) + ] * num_crops[i]) + self.trans = trans + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (sample, target) where target is class_index of the target class. 
+ """ + path, target = self.samples[index] + sample = self.loader(path) + sample = list(map(lambda trans: trans(sample), self.trans)) + if self.return_label: + return sample, target + + return sample + + +def get_pil_gaussian_blur(p=0.5): + gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) + rnd_gaussian_blur = RandomApply([gaussian_blur], p=p) + return rnd_gaussian_blur + + +def get_color_distortion(s=1.0): + # s is the strength of color distortion. + color_jitter = ColorJitter(0.8*s, 0.8*s, 0.8*s, 0.2*s) + rnd_color_jitter = RandomApply([color_jitter], p=0.8) + rnd_gray = RandomGrayscale(p=0.2) + color_distort = Compose([rnd_color_jitter, rnd_gray]) + return color_distort \ No newline at end of file diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index 7be2b26a..9d9eb132 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -57,6 +57,7 @@ "SimCLRGaussianBlur", "BYOLSolarize", "MAERandCropImage", + "GaussianBlur" ] @@ -941,3 +942,21 @@ def __call__(self, img): else: img = ImageOps.solarize(img) return img + + +class GaussianBlur(object): + """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" + def __init__(self, sigma=[.1, 2.], _PIL=False): + self.sigma = sigma + self.kernel_size = 23 + self._PIL = _PIL + + def __call__(self, x): + sigma = np.random.uniform(self.sigma[0], self.sigma[1]) + if self._PIL: + x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) + return x + else: + x = cv2.GaussianBlur(np.array(x), + (self.kernel_size, self.kernel_size), sigma) + return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file diff --git a/passl/models/swav.py b/passl/models/swav.py index e6fb79fb..0181eb73 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -9,13 +9,13 @@ __all__ = [ - # 'swav_resnet50', + 'swav_resnet50_finetune', 'swav_resnet50_linearprobe', - # 'swav_resnet50_pretrain', + 'swav_resnet50_pretrain', 'SwAV', 'SwAVLinearProbe', 'SwAVFinetune', - # 'SwAVPretrain', + 'SwAVPretrain', ] # def model and @@ -130,6 +130,13 @@ def load_pretrained(self, path, rank=0, finetune=False): def forward(self, inp): return self.res_model(inp) +class SwAVPretrain(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def forward(self, inp): + return self.res_model(inp) + def swav_resnet50_linearprobe(**kwargs): model = SwAVLinearProbe(**kwargs) @@ -138,7 +145,10 @@ def swav_resnet50_linearprobe(**kwargs): def swav_resnet50_finetune(**kwargs): model = SwAVFinetune(**kwargs) return model - + +def swav_resnet50_pretrain(**kwargs): + model = SwAVPretrain(**kwargs) + return model # def normal_init(param, **kwargs): # initializer = nn.initializer.Normal(**kwargs) diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 6a492920..fb8c7c97 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -23,7 +23,7 @@ class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, - step_each_epoch, + step_each_epoch, # len(train_loader) = dataset/total_bs epochs, decay_unit='epoch', eta_min=0.0, diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml deleted file mode 100644 index ae0efc7b..00000000 --- a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# global configs -Global: - task_type: Classification - 
train_loop: ClassificationTrainingEpochLoop - validate_loop: ClassificationEvaluationLoop - checkpoint: null - pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained - output_dir: ./output/ - device: gpu - save_interval: 1 - max_num_latest_checkpoint: 0 - eval_during_train: True - eval_interval: 1 - eval_unit: "epoch" - accum_steps: 1 - epochs: 90 - print_batch_step: 10 - use_visualdl: False - seed: 2022 - -# FP16 setting -FP16: - level: O1 - -DistributedStrategy: - data_parallel: True - -# model architecture -Model: - name: mocov3_vit_base_linearprobe - class_num: 1000 - -# loss function config for traing/eval process -Loss: - Train: - - CELoss: - weight: 1.0 - Eval: - - CELoss: - weight: 1.0 - -LRScheduler: - name: TimmCosine - learning_rate: 12.0 - decay_unit: epoch - last_epoch: 0 - warmup_epoch: 0 - -Optimizer: - name: Momentum - momentum: 0.9 - weight_decay: 0.0 - -# data loader for train and eval -DataLoader: - Train: - dataset: - name: ImageFolder - root: data/ILSVRC2012/train - transform: - - RandomResizedCrop: - size: 224 - - RandomHorizontalFlip: - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - sampler: - name: DistributedBatchSampler - batch_size: 128 # accum_steps: 1, total batchsize: 1024 - drop_last: False - shuffle: True - loader: - num_workers: 8 - use_shared_memory: True - - Eval: - dataset: - name: ImageFolder - root: data/ILSVRC2012/val - transform: - - Resize: - size: 256 - - CenterCrop: - size: 224 - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - sampler: - name: DistributedBatchSampler - batch_size: 256 - drop_last: False - shuffle: False - loader: - num_workers: 8 - use_shared_memory: True - -Metric: - Train: - - TopkAcc: - topk: [1, 5] - Eval: - - TopkAcc: - topk: [1, 5] - -Export: - export_type: paddle - input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml deleted file mode 100644 index cb3a7a9e..00000000 --- a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ /dev/null @@ -1,108 +0,0 @@ -# global configs -Global: - task_type: ContrastiveLearning - train_loop: ContrastiveLearningTrainingEpochLoop - validate_loop: None - checkpoint: null - pretrained_model: null - output_dir: ./output/ - device: gpu - save_interval: 1 - max_num_latest_checkpoint: 0 - eval_during_train: False - eval_interval: 1 - eval_unit: "epoch" - accum_steps: 1 - epochs: 300 - print_batch_step: 10 - use_visualdl: False - seed: 2023 - -# FP16 setting -FP16: - level: O1 - GradScaler: - init_loss_scaling: 65536.0 - incr_every_n_steps: 2000 - -DistributedStrategy: - data_parallel: True - -# model architecture -Model: - name: mocov3_vit_base_pretrain - -LRScheduler: - name: TimmCosine - learning_rate: 0.0024 - eta_min: 0.0 - warmup_epoch: 40 - warmup_start_lr: 0.0 - decay_unit: step - warmup_prefix: True - -Optimizer: - name: AdamW - betas: (0.9, 0.999) - eps: 1e-8 - weight_decay: 0.1 - use_master_param: True - exp_avg_force_fp32: True - -# data loader for train and eval -DataLoader: - Train: - dataset: - name: ImageFolder - root: ./dataset/ILSVRC2012/train - transform: - - TwoViewsTransform: - base_transform1: - - RandomResizedCrop: - size: 224 - scale: [0.08, 1.0] - interpolation: bicubic - - ColorJitter: - p: 0.8 - brightness: 0.4 - contrast: 0.4 - saturation: 0.2 - hue: 0.1 - - RandomGrayscale: - p: 0.2 - - 
SimCLRGaussianBlur: - p: 1.0 - sigma: [.1, 2.] - - RandomHorizontalFlip: - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - base_transform2: - - RandomResizedCrop: - size: 224 - scale: [0.08, 1.0] - interpolation: bicubic - - ColorJitter: - p: 0.8 - brightness: 0.4 - contrast: 0.4 - saturation: 0.2 - hue: 0.1 - - RandomGrayscale: - p: 0.2 - - BYOLSolarize: - p: 0.2 - - RandomHorizontalFlip: - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - sampler: - name: DistributedBatchSampler - batch_size: 128 # accum_steps: 1, total batchsize: 4096 - drop_last: False - shuffle: True - loader: - num_workers: 8 - use_shared_memory: True diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..749ef7c6 --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,71 @@ +# global configs +Global: + task_type: ContrastiveLearning + train_loop: ContrastiveLearningTrainingEpochLoop + validate_loop: None + checkpoint: null + pretrained_model: null + output_dir: ./output/pretrain_0420 + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: False + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 800 + print_batch_step: 100 + use_visualdl: False + seed: 31 + +# FP16 setting +# FP16: +# level: O1 +# GradScaler: +# init_loss_scaling: 65536.0 +# incr_every_n_steps: 2000 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_pretrain + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 + +LRScheduler: + name: TimmCosine + learning_rate: 4.8 + decay_unit: step + eta_min: 0.0048 + warmup_epoch: 10 + warmup_start_lr: 0.3 + warmup_prefix: True + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-6 + tensor_fusion: False + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiCropDataset + root: ./dataset/ILSVRC2012/train + size_crops: [224, 96] + num_crops: [2, 6] + min_scale_crops: [0.14, 0.05] + max_scale_crops: [1, 0.14] + sampler: + name: DistributedBatchSampler + batch_size: 128 # accum_steps: 1, total batchsize: 4096 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index cae7ebba..466ecef3 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -15,14 +15,13 @@ # Note: Set the following environment variables # and then need to run the script on each node. unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export FLAGS_stop_check_timeout=3600 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 31511a45..866322e1 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-#unset PADDLE_TRAINER_ENDPOINTS -# export PADDLE_NNODES=1 -# #export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # export FLAGS_stop_check_timeout=3600 unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index f5dfc176..fb44a0d4 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,15 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -#unset PADDLE_TRAINER_ENDPOINTS -#export PADDLE_NNODES=4 -#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" -#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export FLAGS_stop_check_timeout=3600 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file From 1da07b192d1e17d8f6bda712830e6b23c012dcc2 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sun, 23 Apr 2023 16:50:22 +0800 Subject: [PATCH 06/46] update_pretrain --- passl/models/swav.py | 7 ++++++- .../configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 2 +- ...aml => swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml} | 8 +++++--- 3 files changed, 12 insertions(+), 5 deletions(-) rename tasks/ssl/swav/configs/{swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml => swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml} (94%) diff --git a/passl/models/swav.py b/passl/models/swav.py index 0181eb73..dec12dcb 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,4 +1,5 @@ import os +from sys import flags import paddle import paddle.nn as nn @@ -146,7 +147,11 @@ def swav_resnet50_finetune(**kwargs): model = SwAVFinetune(**kwargs) return model -def swav_resnet50_pretrain(**kwargs): +def swav_resnet50_pretrain(**kwargs): # todo + flags = {} + flags['FLAGS_cudnn_exhaustive_search'] = True + flags['FLAGS_cudnn_deterministic'] = True + paddle.set_flags(flags) model = SwAVPretrain(**kwargs) return model diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 251f5f49..33563b0e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0420_align_trackTrue + output_dir: ./output/baseline_0421_align_trackTrue_nolinearload device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml similarity index 94% rename from tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml rename to tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 749ef7c6..8c3f9603 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ 
b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -19,8 +19,8 @@ Global: seed: 31 # FP16 setting -# FP16: -# level: O1 +FP16: + level: O1 # GradScaler: # init_loss_scaling: 65536.0 # incr_every_n_steps: 2000 @@ -46,9 +46,11 @@ LRScheduler: warmup_prefix: True Optimizer: - name: Momentum + name: MomentumLARC momentum: 0.9 weight_decay: 1e-6 + trust_coefficient: 0.001 + clip: False tensor_fusion: False # data loader for train and eval From 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sun, 23 Apr 2023 19:16:38 +0800 Subject: [PATCH 07/46] ready_for_semi --- passl/models/swav.py | 60 +++++++++++++++++++ passl/optimizer/__init__.py | 4 +- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 21 ++++--- tasks/ssl/swav/finetune.sh | 2 +- 4 files changed, 76 insertions(+), 11 deletions(-) diff --git a/passl/models/swav.py b/passl/models/swav.py index dec12dcb..01627b66 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,3 +1,5 @@ +from collections import defaultdict +import copy import os from sys import flags @@ -5,6 +7,7 @@ import paddle.nn as nn from passl.nn import init +from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -127,6 +130,63 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') + + def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + """ + lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + """ + if custom_cfg is not None: + assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." + for item in custom_cfg: + assert isinstance( + item, dict), "The item of `custom_cfg` must be a dict" + + param_group = self._collect_params(self.res_model, tensor_fusion, config) + + return param_group + + def _collect_params(self, config, model, tensor_fusion): + # Collect different parameter groups + if self.custom_cfg is None or len(self.custom_cfg) == 0: + return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + + self.weight_decay = config['weight_decay'] + groups_num = len(self.custom_cfg) + 1 + params_list = [[] for _ in range(groups_num)] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + for idx, item in enumerate(self.custom_cfg): + if item['name'] in name: + params_list[idx].append(param) + break + else: + params_list[-1].append(param) + + res = [] + for idx, item in enumerate(self.custom_cfg): + lr_mult = item.get("lr_mult", 1.0) + weight_decay_mult = item.get("weight_decay_mult", None) + param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if self.weight_decay is not None and weight_decay_mult is not None: + param_dict['weight_decay'] = self.weight_decay * weight_decay_mult + param_dict['tensor_fusion'] = tensor_fusion + res.append(param_dict) + res.append({'params': params_list[-1]}) + + msg = 'Parameter groups for optimizer: \n' + for idx, item in enumerate(self.custom_cfg): + params_name = [p.name for p in params_list[idx]] + item = item.copy() + item['params_name'] = params_name + msg += 'Group {}: \n{} \n'.format(idx, item) + msg += 'Last group:\n params_name: {}'.format( + [p.name for p in params_list[-1]]) + logger.info(msg) + + return res + + def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index b73f0a90..609e83e9 
100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -122,6 +122,7 @@ def build_optimizer(config, lr_scheduler, model=None): config = copy.deepcopy(config) optim_name = config.pop('name') + custom_cfg = config.pop('custom_cfg', None) grad_clip = None grad_clip_config = config.pop('grad_clip', None) @@ -136,7 +137,8 @@ def build_optimizer(config, lr_scheduler, model=None): logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') if hasattr(model, 'param_groups'): - param_group = model.param_groups(no_weight_decay_name, tensor_fusion) + # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim + param_group = model.param_groups(config, tensor_fusion, custom_cfg) for group in param_group: if 'tensor_fusion' in group and group['tensor_fusion']: group['params'] = get_fused_params(group['params']) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index f2a229a0..f7950c0b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams finetune: True output_dir: ./output/semi_0420 device: gpu @@ -41,18 +41,20 @@ Loss: weight: 1.0 LRScheduler: - name: Step - learning_rate: 0.01 - boundaries: [12, 16] - values: [0.01, 0.002, 0.0004] - decay_unit: epoch - last_epoch: 0 + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 tensor_fusion: False + custom_config: + - name: head + lr_mult: 250 # data loader for train and eval DataLoader: @@ -68,9 +70,10 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] + samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 32 # accum_steps: 1, total batchsize: 256 + batch_size: 64 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -92,7 +95,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 32 + batch_size: 64 drop_last: False shuffle: False loader: diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 466ecef3..c06a84cc 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -18,7 +18,7 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ From 767494f24beb2fd9946ffff34ae4ebbb10e1584b Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sun, 23 Apr 2023 19:25:09 +0800 Subject: [PATCH 08/46] same --- passl/models/swav.py | 190 ++++++++++++------ ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 40 ++-- 2 files changed, 149 insertions(+), 81 deletions(-) diff --git a/passl/models/swav.py b/passl/models/swav.py index 8f20b6d9..01627b66 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,20 +1,25 @@ +from collections import 
defaultdict +import copy import os +from sys import flags import paddle import paddle.nn as nn from passl.nn import init +from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model __all__ = [ - # 'swav_resnet50', + 'swav_resnet50_finetune', 'swav_resnet50_linearprobe', - # 'swav_resnet50_pretrain', + 'swav_resnet50_pretrain', 'SwAV', 'SwAVLinearProbe', - # 'SwAVPretrain', + 'SwAVFinetune', + 'SwAVPretrain', ] # def model and @@ -23,7 +28,32 @@ def __init__(self, **kwargs): super().__init__() self.res_model = resnet50(**kwargs) - + def _load_model(self, path, model, tag): + if os.path.isfile(path): + para_state_dict = paddle.load(path) + + # resnet + model_state_dict = model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in para_state_dict: + print("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list(model_state_dict[k] + .shape): + print( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, model_state_dict[k] + .shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + model.set_dict(model_state_dict) + print("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), tag)) + else: + print("No pretrained weights found in {} => training with random weights".format(tag)) + def load_pretrained(self, path, rank=0, finetune=False): pass # if not os.path.exists(path + '.pdparams'): @@ -55,9 +85,9 @@ def save(self, path, local_rank=0, rank=0): class SwAVLinearProbe(SwAV): - def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) - self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) + self.linear = RegLog(class_num) self.res_model.eval() # freeze all layers but the last fc @@ -75,38 +105,11 @@ def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_ def _freeze_norm(self, layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True - - def _load_model(self, path, model, tag): - if os.path.isfile(path): - para_state_dict = paddle.load(path) - - # resnet - model_state_dict = model.state_dict() - keys = model_state_dict.keys() - num_params_loaded = 0 - for k in keys: - if k not in para_state_dict: - print("{} is not in pretrained model".format(k)) - elif list(para_state_dict[k].shape) != list(model_state_dict[k] - .shape): - print( - "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" - .format(k, para_state_dict[k].shape, model_state_dict[k] - .shape)) - else: - model_state_dict[k] = para_state_dict[k] - num_params_loaded += 1 - model.set_dict(model_state_dict) - print("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), tag)) - else: - print("No pretrained weights found in {} => training with random weights".format(tag)) def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') self._load_model("linear.pdparams", self.linear, 'linear') - def forward(self, inp): # import numpy as np # import pdb; pdb.set_trace() @@ -121,16 +124,96 @@ def forward(self, inp): return output +class SwAVFinetune(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def load_pretrained(self, path, rank=0, 
finetune=False): + self._load_model(path, self.res_model, 'backbone') + + def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + """ + lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + """ + if custom_cfg is not None: + assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." + for item in custom_cfg: + assert isinstance( + item, dict), "The item of `custom_cfg` must be a dict" + + param_group = self._collect_params(self.res_model, tensor_fusion, config) + + return param_group + + def _collect_params(self, config, model, tensor_fusion): + # Collect different parameter groups + if self.custom_cfg is None or len(self.custom_cfg) == 0: + return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + + self.weight_decay = config['weight_decay'] + groups_num = len(self.custom_cfg) + 1 + params_list = [[] for _ in range(groups_num)] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + for idx, item in enumerate(self.custom_cfg): + if item['name'] in name: + params_list[idx].append(param) + break + else: + params_list[-1].append(param) + + res = [] + for idx, item in enumerate(self.custom_cfg): + lr_mult = item.get("lr_mult", 1.0) + weight_decay_mult = item.get("weight_decay_mult", None) + param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if self.weight_decay is not None and weight_decay_mult is not None: + param_dict['weight_decay'] = self.weight_decay * weight_decay_mult + param_dict['tensor_fusion'] = tensor_fusion + res.append(param_dict) + res.append({'params': params_list[-1]}) + + msg = 'Parameter groups for optimizer: \n' + for idx, item in enumerate(self.custom_cfg): + params_name = [p.name for p in params_list[idx]] + item = item.copy() + item['params_name'] = params_name + msg += 'Group {}: \n{} \n'.format(idx, item) + msg += 'Last group:\n params_name: {}'.format( + [p.name for p in params_list[-1]]) + logger.info(msg) + + return res + + + + def forward(self, inp): + return self.res_model(inp) + +class SwAVPretrain(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def forward(self, inp): + return self.res_model(inp) + def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(linear_arch="resnet50", - global_avg=True, - use_bn=False, - output_dim=0, - eval_mode=True, - **kwargs) + model = SwAVLinearProbe(**kwargs) return model - + +def swav_resnet50_finetune(**kwargs): + model = SwAVFinetune(**kwargs) + return model + +def swav_resnet50_pretrain(**kwargs): # todo + flags = {} + flags['FLAGS_cudnn_exhaustive_search'] = True + flags['FLAGS_cudnn_deterministic'] = True + paddle.set_flags(flags) + model = SwAVPretrain(**kwargs) + return model # def normal_init(param, **kwargs): # initializer = nn.initializer.Normal(**kwargs) @@ -144,35 +227,16 @@ def swav_resnet50_linearprobe(**kwargs): class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" - def __init__(self, num_labels, arch='resnet50', global_avg=False, - use_bn=True): + def __init__(self, num_labels): super(RegLog, self).__init__() - self.bn = None - if global_avg: - if arch == 'resnet50': - s = 2048 - elif arch == 'resnet50w2': - s = 4096 - elif arch == 'resnet50w4': - s = 8192 - self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - else: - assert arch == 'resnet50' - s = 8192 - self.av_pool = paddle.nn.AvgPool2D(6, stride=1) - if use_bn: - self.bn = paddle.nn.BatchNorm2D(num_features=2048, momentum - =1 - 0.1, 
epsilon=1e-05, weight_attr=None, bias_attr= - None, use_global_stats=True) - + s = 2048 + self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) + init.normal_(self.linear.weight, mean=0.0, std=0.01) init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) - if self.bn is not None: - x = self.bn(x) - x = x.reshape((x.shape[0], -1)) return self.linear(x) \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index a290ea19..f7950c0b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,8 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0420_align_trackTrue + pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams + finetune: True + output_dir: ./output/semi_0420 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -13,12 +14,12 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 100 + epochs: 20 print_batch_step: 100 use_visualdl: False seed: 31 -# FP16 setting ignore in align +# FP16 setting # FP16: # level: O1 @@ -27,8 +28,8 @@ DistributedStrategy: # model architecture Model: - name: swav_resnet50_linearprobe - class_num: 1000 + name: swav_resnet50_finetune + output_dim: 1000 # loss function config for traing/eval process Loss: @@ -40,19 +41,21 @@ Loss: weight: 1.0 LRScheduler: - name: TimmCosine - learning_rate: 0.3 - eta_min: 0.0 - decay_unit: epoch - last_epoch: 0 - warmup_epoch: 0 + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 - weight_decay: 1e-6 - tensor_fusion: True - + weight_decay: 0.0 + tensor_fusion: False + custom_config: + - name: head + lr_mult: 250 + # data loader for train and eval DataLoader: Train: @@ -67,9 +70,10 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] + samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 32 # accum_steps: 1, total batchsize: 256 + batch_size: 64 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -91,7 +95,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 32 + batch_size: 64 drop_last: False shuffle: False loader: @@ -108,4 +112,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] + input_shape: [None, 3, 224, 224] \ No newline at end of file From 0021d00252d6d64f2a014ee09aaa66b32529131e Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 24 Apr 2023 15:41:15 +0800 Subject: [PATCH 09/46] split_params --- passl/data/__init__.py | 1 - passl/data/dataset/imagefolder_dataset.py | 12 +++-- passl/engine/engine.py | 25 ++++++---- passl/engine/loops/classification_loop.py | 4 +- passl/models/__init__.py | 2 +- passl/models/swav.py | 44 ++++++++-------- passl/optimizer/__init__.py | 50 +++++++++++-------- passl/optimizer/momentum.py | 4 -- passl/optimizer/optimizer.py | 8 +-- passl/scheduler/__init__.py | 10 +++- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 27 ++++++---- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 42 
++++++---------- tasks/ssl/swav/finetune.sh | 3 +- tasks/ssl/swav/pretrain.sh | 2 +- 14 files changed, 121 insertions(+), 113 deletions(-) diff --git a/passl/data/__init__.py b/passl/data/__init__.py index 50ce7ec5..049606f6 100644 --- a/passl/data/__init__.py +++ b/passl/data/__init__.py @@ -50,7 +50,6 @@ def build_dataloader(config, mode, device, use_dali=False, if config_batch_transform_ops is not None: batch_transform = utils.create_preprocess_operators( config_batch_transform_ops) - dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index ef03f1c4..5ad4e208 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -14,6 +14,7 @@ import os import urllib +import urllib.request import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union @@ -65,11 +66,14 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples == "semi_10": - train_data_path = os.path.join(root, "train") + # train_data_path = os.path.join(root, "train") percent = samples_tag.split('_')[-1] - subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") - list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] - samples = [(os.path.join(train_data_path, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] + # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + subset_file = str(percent) + "percent.txt" + with open(subset_file, 'r') as f: + list_imgs = [li.split('\n')[0] for li in f.readlines()] + # print(list_imgs) + samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise NotImplementedError('{} is not implemented'.format(samples)) diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 23c59ab9..24301420 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -214,17 +214,20 @@ def worker_init_fn(worker_id): # build optimizer and lr scheduler if self.mode == 'train': - config_lr_scheduler = self.config.get('LRScheduler', None) - self.lr_scheduler = None - if config_lr_scheduler is not None: - self.lr_decay_unit = config_lr_scheduler.get('decay_unit', - 'step') - self.lr_scheduler = build_lr_scheduler( - config_lr_scheduler, self.config["Global"]["epochs"], - len(self.train_dataloader)) - - self.optimizer = build_optimizer(self.config["Optimizer"], - self.lr_scheduler, self.model) + if self.config["Optimizer"].get('decay_unit', None) is not None: + self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] + else: + self.lr_decay_unit = 'step' + Warning('lr_decay_unit is not set in optimizer config, set to step by default') + # self.lr_scheduler = None + # self.lr_scheduler = build_lr_scheduler( + # config_lr_scheduler, self.config["Global"]["epochs"], + # len(self.train_dataloader)) + # # todo add lr scheduler for different group + + # self.optimizer = build_optimizer(self.config["Optimizer"], + # self.lr_scheduler, self.model) + self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader)) # load pretrained model 
if self.config["Global"]["pretrained_model"] is not None: diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index b1d8d47f..3f3c29d9 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -195,8 +195,8 @@ def train_one_step(self, batch): self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - - if self.trainer.lr_decay_unit == 'step': + + if self.trainer.lr_decay_unit == 'step': # default is step self.trainer.optimizer.lr_step(self.global_step) return out, loss_dict diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 0792faae..38ea440d 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,7 @@ from .convnext import * from .mocov3 import * from .swav import * -from .simsiam import * +# from .simsiam import * __all__ = ["build_model"] diff --git a/passl/models/swav.py b/passl/models/swav.py index 01627b66..9ae220e9 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -7,6 +7,7 @@ import paddle.nn as nn from passl.nn import init +from passl.scheduler import build_lr_scheduler, lr_scheduler from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -131,62 +132,61 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ - lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] """ - if custom_cfg is not None: - assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." - for item in custom_cfg: + + self.custom_cfg = config.pop('custom_cfg', None) + if self.custom_cfg is not None: + assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
+ for item in self.custom_cfg: assert isinstance( item, dict), "The item of `custom_cfg` must be a dict" - param_group = self._collect_params(self.res_model, tensor_fusion, config) + param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) return param_group - def _collect_params(self, config, model, tensor_fusion): + def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): # Collect different parameter groups if self.custom_cfg is None or len(self.custom_cfg) == 0: - return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] + # split params self.weight_decay = config['weight_decay'] - groups_num = len(self.custom_cfg) + 1 - params_list = [[] for _ in range(groups_num)] + params_dict = {item['name']: [] for item in self.custom_cfg} for name, param in model.named_parameters(): if param.stop_gradient: continue for idx, item in enumerate(self.custom_cfg): - if item['name'] in name: - params_list[idx].append(param) + if item['name'] in name and item['name']!='PasslDefault': + params_dict[item['name']].append(param) break else: - params_list[-1].append(param) + params_dict['PasslDefault'].append(param) res = [] - for idx, item in enumerate(self.custom_cfg): - lr_mult = item.get("lr_mult", 1.0) + for item in self.custom_cfg: weight_decay_mult = item.get("weight_decay_mult", None) - param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if item.get("LRScheduler", None) is not None: + lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) + param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} + if self.weight_decay is not None and weight_decay_mult is not None: param_dict['weight_decay'] = self.weight_decay * weight_decay_mult param_dict['tensor_fusion'] = tensor_fusion res.append(param_dict) - res.append({'params': params_list[-1]}) msg = 'Parameter groups for optimizer: \n' for idx, item in enumerate(self.custom_cfg): - params_name = [p.name for p in params_list[idx]] + params_name = [p.name for p in params_dict[item['name']]] item = item.copy() item['params_name'] = params_name msg += 'Group {}: \n{} \n'.format(idx, item) - msg += 'Last group:\n params_name: {}'.format( - [p.name for p in params_list[-1]]) logger.info(msg) return res - - def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 609e83e9..2fb4a14f 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -32,16 +32,16 @@ # from .momentum_lars import MomentumLARS -# def build_optimizer(config, lr_scheduler, model=None): -# config = copy.deepcopy(config) +# def build_optimizer(optim_config, lr_scheduler, model=None): +# optim_config = copy.deepcopy(optim_config) # grad_clip = None -# grad_clip_config = config.pop('grad_clip', None) +# grad_clip_config = optim_config.pop('grad_clip', None) # if grad_clip_config is not None: # grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') # grad_clip = eval(grad_clip_name)(**grad_clip_config) -# no_weight_decay_name = config.pop('no_weight_decay_name', []) +# no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) # param_group = defaultdict(list) # for n, p in model.named_parameters(): @@ -74,11 +74,11 @@ # params.append(group) -# optim_name = config.pop('name') +# optim_name = optim_config.pop('name') # optim = eval(optim_name)(params, # 
lr=lr_scheduler, # grad_clip=grad_clip, -# **config) +# **optim_config) # logger.debug("build optimizer ({}) success..".format(optim)) # return optim @@ -119,29 +119,30 @@ from .momentum_larc import MomentumLARC -def build_optimizer(config, lr_scheduler, model=None): - config = copy.deepcopy(config) - optim_name = config.pop('name') - custom_cfg = config.pop('custom_cfg', None) +def build_optimizer(optim_config, model, config, trainset_length): + optim_config = copy.deepcopy(optim_config) + optim_name = optim_config.pop('name') grad_clip = None - grad_clip_config = config.pop('grad_clip', None) + grad_clip_config = optim_config.pop('grad_clip', None) if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval(grad_clip_name)(**grad_clip_config) - no_weight_decay_name = config.pop('no_weight_decay_name', []) - tensor_fusion = config.pop('tensor_fusion', True) + no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) + tensor_fusion = optim_config.pop('tensor_fusion', True) if 'LAR' in optim_name: tensor_fusion = False logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') if hasattr(model, 'param_groups'): # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim - param_group = model.param_groups(config, tensor_fusion, custom_cfg) + param_group = model.param_groups(optim_config, tensor_fusion, config["Global"]["epochs"], trainset_length) for group in param_group: if 'tensor_fusion' in group and group['tensor_fusion']: group['params'] = get_fused_params(group['params']) + optim_config.pop('custom_cfg', None) + else: param_group_map = defaultdict(list) for n, p in model.named_parameters(): @@ -175,16 +176,21 @@ def build_optimizer(config, lr_scheduler, model=None): param_group.append(group) - lr = lr_scheduler - lr_func = None - if isinstance(lr_scheduler, LRCallable): - lr = lr_scheduler.lr - lr_func = lr_scheduler + # lr = lr_scheduler + # lr_func = None + # if isinstance(lr_scheduler, LRCallable): # 如果是自定义的 scheduler,则lr为数字,使用lr_func 进行lr的迭代 + # lr = lr_scheduler.lr + # lr_func = lr_scheduler + + for i, item in enumerate(param_group): + for key, val in item.items(): + if key != 'params': + print(' {} is {}'.format(key, val)) + else: + print("Group {}: param: {}".format(i, [p.name for p in item[key]])) optim = eval(optim_name)(param_group, - lr=lr, - lr_func=lr_func, grad_clip=grad_clip, - **config) + **optim_config) logger.debug("build optimizer ({}) success..".format(optim)) return optim diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 8b569c7c..179839fc 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -26,8 +26,6 @@ class Momentum(Optimizer): def __init__(self, params, - lr=0.001, - lr_func=None, momentum=0.9, weight_decay=0.0, use_master_param=True, @@ -35,8 +33,6 @@ def __init__(self, **args): defaults = dict( - lr=lr, - lr_func=lr_func, momentum=momentum, weight_decay=weight_decay, use_master_param=use_master_param, diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 98e6a3b1..d3f4ae63 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -206,12 +206,12 @@ def clear_grad(self, set_to_zero=True): @paddle.no_grad() def lr_step(self, step=None): - for group in self.param_groups: + for i, group in enumerate(self.param_groups): lr = group['lr'] - if isinstance(lr, paddle.optimizer.lr.LRScheduler): 
+ + if isinstance(lr, paddle.optimizer.lr.LRScheduler): # group defined lr scheduler lr.step(step) - elif 'lr_func' in group and callable(group['lr_func']): - group['lr_func'](group, step) + print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) @paddle.no_grad() def get_lr(self, group_id=0): diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index ecfb2cf6..4f31e170 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -12,6 +12,7 @@ # limitations under the License. import paddle +from paddle.optimizer.lr import MultiStepDecay from passl.utils import logger @@ -19,10 +20,15 @@ from .lr_callable import LRCallable, CosineWithFixLR -def build_lr_scheduler(lr_config, epochs, step_each_epoch): - lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) +def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch, 'decay_unit': decay_unit }) if 'name' in lr_config: lr_name = lr_config.pop('name') + if "MultiStepDecay" in lr_name: + lr_config.pop('epochs') + lr_config.pop('step_each_epoch') + lr_config.pop('decay_unit') + print(lr_config) lr = eval(lr_name)(**lr_config) if isinstance(lr, paddle.optimizer.lr.LRScheduler): return lr diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index f7950c0b..9781c34e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0420 + output_dir: ./output/semi_0424 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -40,21 +40,28 @@ Loss: - CELoss: weight: 1.0 -LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 tensor_fusion: False - custom_config: + decay_unit: epoch + custom_cfg: - name: head - lr_mult: 250 + LRScheduler: + name: MultiStepDecay + learning_rate: 5 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 + - name: PasslDefault + LRScheduler: + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 3136121c..33563b0e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,14 +4,8 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null -<<<<<<< HEAD - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams - finetune: True - output_dir: ./output/semi_0420 -======= pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams output_dir: ./output/baseline_0421_align_trackTrue_nolinearload ->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 
device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -19,12 +13,12 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 20 + epochs: 100 print_batch_step: 100 use_visualdl: False seed: 31 -# FP16 setting +# FP16 setting ignore in align # FP16: # level: O1 @@ -33,15 +27,10 @@ DistributedStrategy: # model architecture Model: -<<<<<<< HEAD - name: swav_resnet50_finetune - output_dim: 1000 -======= name: swav_resnet50_linearprobe output_dim: 0 eval_mode: True class_num: 1000 ->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 # loss function config for traing/eval process Loss: @@ -53,21 +42,19 @@ Loss: weight: 1.0 LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 + name: TimmCosine + learning_rate: 0.3 + eta_min: 0.0 + decay_unit: epoch + last_epoch: 0 + warmup_epoch: 0 Optimizer: name: Momentum momentum: 0.9 - weight_decay: 0.0 - tensor_fusion: False - custom_config: - - name: head - lr_mult: 250 - + weight_decay: 1e-6 + tensor_fusion: True + # data loader for train and eval DataLoader: Train: @@ -82,10 +69,9 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 64 # accum_steps: 1, total batchsize: 256 + batch_size: 32 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -107,7 +93,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 64 + batch_size: 32 drop_last: False shuffle: False loader: @@ -124,4 +110,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] \ No newline at end of file + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index c06a84cc..5aa3ff33 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -19,9 +19,10 @@ unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 +export https_proxy="http://172.19.56.199:3128" python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index fb44a0d4..d1c866c6 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -22,4 +22,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file From 57af8e9814fa3b244cea0d608b813ac6df8a4eb3 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 24 Apr 2023 15:41:15 +0800 Subject: [PATCH 10/46] split_params --- passl/data/__init__.py | 1 - passl/data/dataset/imagefolder_dataset.py | 12 +- passl/engine/engine.py | 17 +-- passl/engine/loops/classification_loop.py | 4 +- passl/models/__init__.py | 2 +- passl/models/swav.py | 51 ++++---- passl/optimizer/__init__.py | 112 ++---------------- passl/optimizer/momentum.py | 4 - passl/optimizer/optimizer.py | 7 +- passl/scheduler/__init__.py | 10 +- 
passl/scheduler/lr_scheduler.py | 6 +- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 27 +++-- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 47 +++----- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 20 ++-- tasks/ssl/swav/finetune.sh | 3 +- tasks/ssl/swav/pretrain.sh | 2 +- 16 files changed, 119 insertions(+), 206 deletions(-) diff --git a/passl/data/__init__.py b/passl/data/__init__.py index 50ce7ec5..049606f6 100644 --- a/passl/data/__init__.py +++ b/passl/data/__init__.py @@ -50,7 +50,6 @@ def build_dataloader(config, mode, device, use_dali=False, if config_batch_transform_ops is not None: batch_transform = utils.create_preprocess_operators( config_batch_transform_ops) - dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index ef03f1c4..5ad4e208 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -14,6 +14,7 @@ import os import urllib +import urllib.request import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union @@ -65,11 +66,14 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples == "semi_10": - train_data_path = os.path.join(root, "train") + # train_data_path = os.path.join(root, "train") percent = samples_tag.split('_')[-1] - subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") - list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] - samples = [(os.path.join(train_data_path, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] + # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + subset_file = str(percent) + "percent.txt" + with open(subset_file, 'r') as f: + list_imgs = [li.split('\n')[0] for li in f.readlines()] + # print(list_imgs) + samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise NotImplementedError('{} is not implemented'.format(samples)) diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 23c59ab9..378a387b 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -214,17 +214,12 @@ def worker_init_fn(worker_id): # build optimizer and lr scheduler if self.mode == 'train': - config_lr_scheduler = self.config.get('LRScheduler', None) - self.lr_scheduler = None - if config_lr_scheduler is not None: - self.lr_decay_unit = config_lr_scheduler.get('decay_unit', - 'step') - self.lr_scheduler = build_lr_scheduler( - config_lr_scheduler, self.config["Global"]["epochs"], - len(self.train_dataloader)) - - self.optimizer = build_optimizer(self.config["Optimizer"], - self.lr_scheduler, self.model) + if self.config["Optimizer"].get('decay_unit', None) is not None: + self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] + else: + self.lr_decay_unit = 'step' + Warning('lr_decay_unit is not set in optimizer config, set to step by default') + self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader)) # load pretrained model if self.config["Global"]["pretrained_model"] is not None: diff --git a/passl/engine/loops/classification_loop.py 
b/passl/engine/loops/classification_loop.py index b1d8d47f..3f3c29d9 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -195,8 +195,8 @@ def train_one_step(self, batch): self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - - if self.trainer.lr_decay_unit == 'step': + + if self.trainer.lr_decay_unit == 'step': # default is step self.trainer.optimizer.lr_step(self.global_step) return out, loss_dict diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 0792faae..38ea440d 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,7 @@ from .convnext import * from .mocov3 import * from .swav import * -from .simsiam import * +# from .simsiam import * __all__ = ["build_model"] diff --git a/passl/models/swav.py b/passl/models/swav.py index 01627b66..c9ee2e6a 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -7,6 +7,7 @@ import paddle.nn as nn from passl.nn import init +from passl.scheduler import build_lr_scheduler, lr_scheduler from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -131,62 +132,66 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ - lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] """ - if custom_cfg is not None: - assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." - for item in custom_cfg: + + self.custom_cfg = config.pop('custom_cfg', None) + if self.custom_cfg is not None: + assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." + assert self.custom_cfg['PasslDefault'].get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' 
+ for item in self.custom_cfg: assert isinstance( item, dict), "The item of `custom_cfg` must be a dict" - param_group = self._collect_params(self.res_model, tensor_fusion, config) + param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) return param_group - def _collect_params(self, config, model, tensor_fusion): + def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): # Collect different parameter groups if self.custom_cfg is None or len(self.custom_cfg) == 0: - return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] + # split params self.weight_decay = config['weight_decay'] - groups_num = len(self.custom_cfg) + 1 - params_list = [[] for _ in range(groups_num)] + params_dict = {item['name']: [] for item in self.custom_cfg} for name, param in model.named_parameters(): if param.stop_gradient: continue for idx, item in enumerate(self.custom_cfg): - if item['name'] in name: - params_list[idx].append(param) + if item['name'] in name and item['name']!='PasslDefault': + params_dict[item['name']].append(param) break else: - params_list[-1].append(param) - + params_dict['PasslDefault'].append(param) res = [] - for idx, item in enumerate(self.custom_cfg): - lr_mult = item.get("lr_mult", 1.0) + for item in self.custom_cfg: weight_decay_mult = item.get("weight_decay_mult", None) - param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if item.get("LRScheduler", None) is not None: + lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) + + else: + Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) + # todo: initialize LRCallable here. + lr_scheduler = build_lr_scheduler(self.custom_cfg['PasslDefault']['LRScheduler'], epochs, trainset_length, config['decay_unit']) + param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} + if self.weight_decay is not None and weight_decay_mult is not None: param_dict['weight_decay'] = self.weight_decay * weight_decay_mult param_dict['tensor_fusion'] = tensor_fusion res.append(param_dict) - res.append({'params': params_list[-1]}) msg = 'Parameter groups for optimizer: \n' for idx, item in enumerate(self.custom_cfg): - params_name = [p.name for p in params_list[idx]] + params_name = [p.name for p in params_dict[item['name']]] item = item.copy() item['params_name'] = params_name msg += 'Group {}: \n{} \n'.format(idx, item) - msg += 'Last group:\n params_name: {}'.format( - [p.name for p in params_list[-1]]) logger.info(msg) return res - - def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 609e83e9..5a2add56 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -1,88 +1,3 @@ -# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# # See the License for the specific language governing permissions and -# # limitations under the License. - -# from __future__ import absolute_import -# from __future__ import division -# from __future__ import print_function - -# from collections import defaultdict - -# import copy -# import paddle - -# from passl.core.grad_clip import ClipGradByGlobalNorm -# from passl.core.param_fuse import get_fused_params - -# from passl.utils import logger - -# from .optimizer import Optimizer -# from .adamw import AdamW -# from .adafactor import Adafactor -# from .momentum import Momentum -# from .momentum_lars import MomentumLARS - - -# def build_optimizer(config, lr_scheduler, model=None): -# config = copy.deepcopy(config) - -# grad_clip = None -# grad_clip_config = config.pop('grad_clip', None) -# if grad_clip_config is not None: -# grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') -# grad_clip = eval(grad_clip_name)(**grad_clip_config) - -# no_weight_decay_name = config.pop('no_weight_decay_name', []) - -# param_group = defaultdict(list) -# for n, p in model.named_parameters(): -# state = copy.deepcopy(p.__dict__) -# if any(nd in n for nd in no_weight_decay_name): -# state['no_weight_decay'] = True -# param_group[str(state)].append(p) - -# # fuse params -# for key in param_group: -# if 'gpu' not in paddle.get_device(): -# continue -# if "'is_distributed': True" in key: -# continue -# if "'has_sparse_grad': True" in key: -# continue - -# param_group[key] = get_fused_params(param_group[key]) - -# # bulid optimizer params -# params = [] -# for key in param_group: -# group = {'params': param_group[key]} - -# if "'is_distributed': True" in key: -# group['is_distributed'] = True - -# if 'no_weight_decay' in key: -# group['weight_decay'] = 0.0 - -# params.append(group) - -# optim_name = config.pop('name') -# optim = eval(optim_name)(params, -# lr=lr_scheduler, -# grad_clip=grad_clip, -# **config) -# logger.debug("build optimizer ({}) success..".format(optim)) -# return optim - - # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -119,29 +34,30 @@ from .momentum_larc import MomentumLARC -def build_optimizer(config, lr_scheduler, model=None): - config = copy.deepcopy(config) - optim_name = config.pop('name') - custom_cfg = config.pop('custom_cfg', None) +def build_optimizer(optim_config, model, config, trainset_length): + optim_config = copy.deepcopy(optim_config) + optim_name = optim_config.pop('name') grad_clip = None - grad_clip_config = config.pop('grad_clip', None) + grad_clip_config = optim_config.pop('grad_clip', None) if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval(grad_clip_name)(**grad_clip_config) - no_weight_decay_name = config.pop('no_weight_decay_name', []) - tensor_fusion = config.pop('tensor_fusion', True) + no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) + tensor_fusion = optim_config.pop('tensor_fusion', True) if 'LAR' in optim_name: tensor_fusion = False logger.info('LARS or LARC Optimizer can not use tensor fusion technology. 
It automatically fall back to `tensor_fusion = False`.') if hasattr(model, 'param_groups'): # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim - param_group = model.param_groups(config, tensor_fusion, custom_cfg) + param_group = model.param_groups(optim_config, tensor_fusion, config["Global"]["epochs"], trainset_length) for group in param_group: if 'tensor_fusion' in group and group['tensor_fusion']: group['params'] = get_fused_params(group['params']) + optim_config.pop('custom_cfg', None) + else: param_group_map = defaultdict(list) for n, p in model.named_parameters(): @@ -175,16 +91,8 @@ def build_optimizer(config, lr_scheduler, model=None): param_group.append(group) - lr = lr_scheduler - lr_func = None - if isinstance(lr_scheduler, LRCallable): - lr = lr_scheduler.lr - lr_func = lr_scheduler - optim = eval(optim_name)(param_group, - lr=lr, - lr_func=lr_func, grad_clip=grad_clip, - **config) + **optim_config) logger.debug("build optimizer ({}) success..".format(optim)) return optim diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 8b569c7c..179839fc 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -26,8 +26,6 @@ class Momentum(Optimizer): def __init__(self, params, - lr=0.001, - lr_func=None, momentum=0.9, weight_decay=0.0, use_master_param=True, @@ -35,8 +33,6 @@ def __init__(self, **args): defaults = dict( - lr=lr, - lr_func=lr_func, momentum=momentum, weight_decay=weight_decay, use_master_param=use_master_param, diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 98e6a3b1..b0026c76 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -206,12 +206,15 @@ def clear_grad(self, set_to_zero=True): @paddle.no_grad() def lr_step(self, step=None): - for group in self.param_groups: + for i, group in enumerate(self.param_groups): lr = group['lr'] - if isinstance(lr, paddle.optimizer.lr.LRScheduler): + + if isinstance(lr, paddle.optimizer.lr.LRScheduler): # group defined lr scheduler lr.step(step) elif 'lr_func' in group and callable(group['lr_func']): group['lr_func'](group, step) + # todo: compact LRCallable + print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) @paddle.no_grad() def get_lr(self, group_id=0): diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index ecfb2cf6..4f31e170 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -12,6 +12,7 @@ # limitations under the License. 
import paddle +from paddle.optimizer.lr import MultiStepDecay from passl.utils import logger @@ -19,10 +20,15 @@ from .lr_callable import LRCallable, CosineWithFixLR -def build_lr_scheduler(lr_config, epochs, step_each_epoch): - lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) +def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch, 'decay_unit': decay_unit }) if 'name' in lr_config: lr_name = lr_config.pop('name') + if "MultiStepDecay" in lr_name: + lr_config.pop('epochs') + lr_config.pop('step_each_epoch') + lr_config.pop('decay_unit') + print(lr_config) lr = eval(lr_name)(**lr_config) if isinstance(lr, paddle.optimizer.lr.LRScheduler): return lr diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index fb8c7c97..1159a27c 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -23,7 +23,7 @@ class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, - step_each_epoch, # len(train_loader) = dataset/total_bs + step_each_epoch, epochs, decay_unit='epoch', eta_min=0.0, @@ -123,8 +123,8 @@ class Step(lr.LRScheduler): def __init__(self, step_each_epoch, epochs, - boundaries, # [12, 16] - values, #[0.01, 0.002, 0.0004], + boundaries, + values, warmup_steps=0, warmup_epochs=0, decay_unit='epoch', diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index f7950c0b..9781c34e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0420 + output_dir: ./output/semi_0424 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -40,21 +40,28 @@ Loss: - CELoss: weight: 1.0 -LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 tensor_fusion: False - custom_config: + decay_unit: epoch + custom_cfg: - name: head - lr_mult: 250 + LRScheduler: + name: MultiStepDecay + learning_rate: 5 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 + - name: PasslDefault + LRScheduler: + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 3136121c..f80b2fa5 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,14 +4,8 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null -<<<<<<< HEAD - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams - finetune: True - output_dir: ./output/semi_0420 -======= pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams output_dir: ./output/baseline_0421_align_trackTrue_nolinearload 
->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -19,12 +13,12 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 20 + epochs: 100 print_batch_step: 100 use_visualdl: False seed: 31 -# FP16 setting +# FP16 setting ignore in align # FP16: # level: O1 @@ -33,15 +27,10 @@ DistributedStrategy: # model architecture Model: -<<<<<<< HEAD - name: swav_resnet50_finetune - output_dim: 1000 -======= name: swav_resnet50_linearprobe output_dim: 0 eval_mode: True class_num: 1000 ->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 # loss function config for traing/eval process Loss: @@ -52,22 +41,21 @@ Loss: - CELoss: weight: 1.0 -LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 - Optimizer: name: Momentum momentum: 0.9 - weight_decay: 0.0 - tensor_fusion: False - custom_config: - - name: head - lr_mult: 250 - + weight_decay: 1e-6 + tensor_fusion: True + decay_unit: epoch + custom_cfg: + - name: PasslDefault + LRScheduler: + name: TimmCosine + learning_rate: 0.3 + eta_min: 0.0 + last_epoch: 0 + warmup_epoch: 0 + # data loader for train and eval DataLoader: Train: @@ -82,10 +70,9 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 64 # accum_steps: 1, total batchsize: 256 + batch_size: 32 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -107,7 +94,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 64 + batch_size: 32 drop_last: False shuffle: False loader: @@ -124,4 +111,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] \ No newline at end of file + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 8c3f9603..c514c6bc 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -36,15 +36,6 @@ Model: output_dim: 128 nmb_prototypes: 3000 -LRScheduler: - name: TimmCosine - learning_rate: 4.8 - decay_unit: step - eta_min: 0.0048 - warmup_epoch: 10 - warmup_start_lr: 0.3 - warmup_prefix: True - Optimizer: name: MomentumLARC momentum: 0.9 @@ -52,6 +43,17 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False + decay_unit: epoch + custom_cfg: + - name: PasslDefault + LRScheduler: + name: TimmCosine + learning_rate: 4.8 + decay_unit: step + eta_min: 0.0048 + warmup_epoch: 10 + warmup_start_lr: 0.3 + warmup_prefix: True # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index c06a84cc..5aa3ff33 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -19,9 +19,10 @@ unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 +export https_proxy="http://172.19.56.199:3128" python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain.sh 
b/tasks/ssl/swav/pretrain.sh index fb44a0d4..d1c866c6 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -22,4 +22,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file From 277ab82ceebb9d9f49ee5f9b570b87ce3976e884 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 25 Apr 2023 17:49:44 +0800 Subject: [PATCH 11/46] validate_ft --- passl/data/dataset/imagefolder_dataset.py | 2 +- passl/engine/loops/classification_loop.py | 43 ++++++++++++------- passl/models/resnet.py | 3 +- passl/models/swav.py | 34 +++++++++------ ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 5 +-- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 2 +- tasks/ssl/swav/finetune.sh | 6 ++- 7 files changed, 60 insertions(+), 35 deletions(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 5ad4e208..5d994267 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -65,7 +65,7 @@ def __init__(self, classes, class_to_idx = self.find_classes(self.root) if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) - elif samples_tag == "semi_1" or samples == "semi_10": + elif samples_tag == "semi_1" or samples_tag == "semi_10": # train_data_path = os.path.join(root, "train") percent = samples_tag.split('_')[-1] # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 3f3c29d9..6463357a 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -108,12 +108,7 @@ def log_model(model, logger): model1 = model.res_model for name, param in model1.named_parameters(): logger.info(name) - logger.info(param.abs().sum()) - - model2 = model.linear - for name, param in model2.named_parameters(): - logger.info(name) - logger.info(param.abs().sum()) + logger.info(param.abs().mean()) class ClassificationTrainingEpochLoop(TrainingEpochLoop): @@ -135,7 +130,14 @@ def forward_backward(self, batch): for idx in range(self.trainer.accum_steps): data = batch[0][idx * step_size:(idx + 1) * step_size] label = batch[1][idx * step_size:(idx + 1) * step_size] - + + ####### test ####### + # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') + # import numpy as np + # np.random.seed(42) + # a = np.random.rand(32, 3, 224, 224) + # data = paddle.to_tensor(a).astype('float32') + # do cast if using fp16 otherwise do nothing with paddle.amp.auto_cast( enable=self.trainer.fp16, @@ -145,12 +147,12 @@ def forward_backward(self, batch): out = self.trainer.model(data) final_out.append(out) - - # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') - + loss_dict = self.trainer.train_loss_func(out, label) - - # logger1 = init_logger('first') + # import pdb; 
pdb.set_trace() + + ####### test ####### + # logger1 = init_logger('before') # log_model(self.trainer.model, logger1) for key in loss_dict: @@ -163,6 +165,7 @@ def forward_backward(self, batch): scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() + ####### test ####### # grad_sync(self.trainer.optimizer.param_groups) # # do unscale and step if using fp16 and not found nan/inf @@ -172,13 +175,12 @@ def forward_backward(self, batch): # # otherwise do nothing # self.trainer.scaler.update() - # logger2 = init_logger('second') + # logger2 = init_logger('after') # log_model(self.trainer.model, logger2) - # import pdb; pdb.set_trace() out = paddle.concat(final_out, axis=0) - return out, final_loss_dict, + return out, final_loss_dict def train_one_step(self, batch): @@ -278,9 +280,20 @@ def eval_one_dataset(self, eval_dataloader): custom_white_list=self.trainer.fp16_custom_white_list, custom_black_list=self.trainer.fp16_custom_black_list, level=self.trainer.fp16_level): + + ####### test ####### + # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960, 133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') + # import numpy as np + # np.random.seed(42) + # a = np.random.rand(32, 3, 224, 224) + # data = paddle.to_tensor(a).astype('float32') + + # import pdb; pdb.set_trace() + # out = self.trainer.model(data) out = self.trainer.model(batch[0]) # calc loss if self.trainer.eval_loss_func is not None: + # loss_dict = self.trainer.eval_loss_func(out, target) loss_dict = self.trainer.eval_loss_func(out, batch[1]) for key in loss_dict: if key not in output_info: diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 1fa48c34..34761215 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -109,7 +109,7 @@ def __init__(self, block, layers, zero_init_residual=False, groups=1, super(ResNet, self).__init__() if norm_layer is None: - norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=False) + norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=True) self._norm_layer = norm_layer self.eval_mode = eval_mode self.padding = paddle.nn.Pad2D(padding=1, value=0.0) @@ -196,7 +196,6 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False): return paddle.nn.Sequential(*layers) def forward_backbone(self, x): - x = self.padding(x) x = self.conv1(x) x = self.bn1(x) diff --git a/passl/models/swav.py b/passl/models/swav.py index 414fd5f2..377e6839 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -109,16 +109,9 @@ def _freeze_norm(self, layer): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - self._load_model("linear.pdparams", self.linear, 'linear') + # self._load_model("linear.pdparams", self.linear, 'linear') def forward(self, inp): -# import numpy as np - # import pdb; pdb.set_trace() - -# np.random.seed(42) -# a = np.random.rand(32, 3, 224, 224) -# inp = paddle.to_tensor(a).astype('float32') - with paddle.no_grad(): output = self.res_model(inp) output = self.linear(output) @@ -128,10 +121,16 @@ def forward(self, inp): class SwAVFinetune(SwAV): def __init__(self, **kwargs): super().__init__(**kwargs) + self.apply(self._freeze_norm) def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, 
self.res_model, 'backbone') - + # self._load_model("projection_head.pdparams", self.res_model.projection_head, 'projection_head') + + def _freeze_norm(self, layer): + if isinstance(layer, (nn.layer.norm._BatchNormBase)): + layer._use_global_stats = True + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] @@ -140,9 +139,12 @@ def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length= self.custom_cfg = config.pop('custom_cfg', None) if self.custom_cfg is not None: assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." - assert self.custom_cfg['PasslDefault'].get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' for item in self.custom_cfg: - assert isinstance( + if item['name']=='PasslDefault': + assert item.get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' + + for item in self.custom_cfg: + assert isinstance( item, dict), "The item of `custom_cfg` must be a dict" param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) @@ -209,14 +211,22 @@ def swav_resnet50_linearprobe(**kwargs): return model def swav_resnet50_finetune(**kwargs): + # flags = {} + # flags['FLAGS_cudnn_exhaustive_search'] = False + # flags['FLAGS_cudnn_deterministic'] = False + # paddle.set_flags(flags) model = SwAVFinetune(**kwargs) + if paddle.distributed.get_world_size() > 1: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) return model def swav_resnet50_pretrain(**kwargs): # todo flags = {} flags['FLAGS_cudnn_exhaustive_search'] = True - flags['FLAGS_cudnn_deterministic'] = True + flags['FLAGS_cudnn_deterministic'] = False paddle.set_flags(flags) + if paddle.distributed.get_world_size() > 1: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) model = SwAVPretrain(**kwargs) return model diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index 9781c34e..dd7fa58d 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -6,7 +6,7 @@ Global: checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0424 + output_dir: ./output/semi_0425_readyagain device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -15,7 +15,7 @@ Global: eval_unit: "epoch" accum_steps: 1 epochs: 20 - print_batch_step: 100 + print_batch_step: 50 # 50 use_visualdl: False seed: 31 @@ -40,7 +40,6 @@ Loss: - CELoss: weight: 1.0 - Optimizer: name: Momentum momentum: 0.9 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index e3b179bb..8e86cb41 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0421_align_trackTrue_nolinearload + output_dir: ./output/baseline_0425 
device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 5aa3ff33..f37e8767 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -18,7 +18,8 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 +# export CUDA_VISIBLE_DEVICES=4 #,1,2,3 +export CUDA_VISIBLE_DEVICES=5,6,7,0 export https_proxy="http://172.19.56.199:3128" python -m paddle.distributed.launch \ @@ -26,3 +27,6 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml + + # --log_dir='output' \ + \ No newline at end of file From 5e739fa5b3c8d90af82373a948c82b4575fe820c Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 26 Apr 2023 10:20:42 +0800 Subject: [PATCH 12/46] format --- passl/data/dataset/imagefolder_dataset.py | 8 ++++---- passl/models/swav.py | 13 ------------- passl/scheduler/lr_scheduler.py | 4 ++-- .../swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 10 +++------- .../swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 7 +------ tasks/ssl/swav/finetune.sh | 9 ++------- tasks/ssl/swav/linearprobe.sh | 3 --- 7 files changed, 12 insertions(+), 42 deletions(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 5d994267..dac2634a 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -66,13 +66,13 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples_tag == "semi_10": - # train_data_path = os.path.join(root, "train") - percent = samples_tag.split('_')[-1] + # connection reset # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") - subset_file = str(percent) + "percent.txt" + # list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] + subset_file = str(samples_tag.split('_')[-1]) + "percent.txt" with open(subset_file, 'r') as f: list_imgs = [li.split('\n')[0] for li in f.readlines()] - # print(list_imgs) + samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise NotImplementedError('{} is not implemented'.format(samples)) diff --git a/passl/models/swav.py b/passl/models/swav.py index 377e6839..3a318ca7 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -211,10 +211,6 @@ def swav_resnet50_linearprobe(**kwargs): return model def swav_resnet50_finetune(**kwargs): - # flags = {} - # flags['FLAGS_cudnn_exhaustive_search'] = False - # flags['FLAGS_cudnn_deterministic'] = False - # paddle.set_flags(flags) model = SwAVFinetune(**kwargs) if paddle.distributed.get_world_size() > 1: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) @@ -230,15 +226,6 @@ def swav_resnet50_pretrain(**kwargs): # todo model = SwAVPretrain(**kwargs) return model -# def normal_init(param, **kwargs): -# initializer = nn.initializer.Normal(**kwargs) -# initializer(param, param.block) - -# def constant_init(param, **kwargs): -# initializer = nn.initializer.Constant(**kwargs) -# initializer(param, param.block) - - class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" diff 
--git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 1159a27c..223ca349 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -23,7 +23,7 @@ class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, - step_each_epoch, + step_each_epoch, epochs, decay_unit='epoch', eta_min=0.0, @@ -124,7 +124,7 @@ def __init__(self, step_each_epoch, epochs, boundaries, - values, + values, warmup_steps=0, warmup_epochs=0, decay_unit='epoch', diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index dd7fa58d..946001dd 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -6,7 +6,7 @@ Global: checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0425_readyagain + output_dir: ./output/semi_0426_semi10 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -15,14 +15,10 @@ Global: eval_unit: "epoch" accum_steps: 1 epochs: 20 - print_batch_step: 50 # 50 + print_batch_step: 50 use_visualdl: False seed: 31 -# FP16 setting -# FP16: -# level: O1 - DistributedStrategy: data_parallel: True @@ -76,7 +72,7 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_1 + samples_tag: semi_10 sampler: name: DistributedBatchSampler batch_size: 64 # accum_steps: 1, total batchsize: 256 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 8e86cb41..4780a9e1 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0425 + output_dir: ./output device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -18,10 +18,6 @@ Global: use_visualdl: False seed: 31 -# FP16 setting ignore in align -# FP16: -# level: O1 - DistributedStrategy: data_parallel: True @@ -41,7 +37,6 @@ Loss: - CELoss: weight: 1.0 - Optimizer: name: Momentum momentum: 0.9 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index f37e8767..c577ddb1 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -12,21 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Note: Set the following environment variables -# and then need to run the script on each node. 
+ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -# export CUDA_VISIBLE_DEVICES=4 #,1,2,3 -export CUDA_VISIBLE_DEVICES=5,6,7,0 -export https_proxy="http://172.19.56.199:3128" +export CUDA_VISIBLE_DEVICES=4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml - - # --log_dir='output' \ \ No newline at end of file diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 866322e1..4c37392b 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# export FLAGS_stop_check_timeout=3600 unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 @@ -24,5 +23,3 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml - -# python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c \ No newline at end of file From 76056f523b5ee3c1228a6982f2e0495dcc031df0 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 28 Apr 2023 15:18:52 +0800 Subject: [PATCH 13/46] add_pretrain --- passl/data/dataset/__init__.py | 1 + passl/data/dataset/multicrop_dataset.py | 10 +- passl/data/preprocess/basic_transforms.py | 32 +++--- passl/engine/engine.py | 10 +- passl/engine/loops/classification_loop.py | 2 +- .../engine/loops/contrastive_learning_loop.py | 9 +- passl/engine/loops/loop.py | 9 +- passl/models/swav.py | 104 ++++++++++++++++-- passl/optimizer/__init__.py | 11 +- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 13 +-- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 14 +-- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 54 ++++++--- tasks/ssl/swav/pretrain.sh | 4 +- 13 files changed, 198 insertions(+), 75 deletions(-) diff --git a/passl/data/dataset/__init__.py b/passl/data/dataset/__init__.py index b3e14445..b19912e1 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -63,3 +63,4 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder +from .multicrop_dataset import MultiCropDataset diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index 926d4a59..ffa30008 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -23,7 +23,8 @@ from passl.data.dataset.imagefolder_dataset import ImageFolder from passl.data.preprocess import ( RandomApply, - GaussianBlur, + # GaussianBlur, + SimCLRGaussianBlur, NormalizeImage, RandomGrayscale, ) @@ -31,13 +32,13 @@ class MultiCropDataset(ImageFolder): def __init__(self, - dataroot, + root, size_crops, num_crops, min_scale_crops, max_scale_crops, return_label=False): - super(MultiCropDataset, self).__init__(dataroot) + super(MultiCropDataset, self).__init__(root) assert len(size_crops) == len(num_crops) assert len(min_scale_crops) == len(num_crops) @@ -80,7 +81,8 @@ def __getitem__(self, index): def get_pil_gaussian_blur(p=0.5): - gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) + # gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) + gaussian_blur = SimCLRGaussianBlur(sigma=[.1, 
2.]) rnd_gaussian_blur = RandomApply([gaussian_blur], p=p) return rnd_gaussian_blur diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index 9d9eb132..96a784c3 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -944,19 +944,19 @@ def __call__(self, img): return img -class GaussianBlur(object): - """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" - def __init__(self, sigma=[.1, 2.], _PIL=False): - self.sigma = sigma - self.kernel_size = 23 - self._PIL = _PIL - - def __call__(self, x): - sigma = np.random.uniform(self.sigma[0], self.sigma[1]) - if self._PIL: - x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) - return x - else: - x = cv2.GaussianBlur(np.array(x), - (self.kernel_size, self.kernel_size), sigma) - return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file +# class GaussianBlur(object): +# """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" +# def __init__(self, sigma=[.1, 2.], _PIL=False): +# self.sigma = sigma +# self.kernel_size = 23 +# self._PIL = _PIL + +# def __call__(self, x): +# sigma = np.random.uniform(self.sigma[0], self.sigma[1]) +# if self._PIL: +# x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) +# return x +# else: +# x = cv2.GaussianBlur(np.array(x), +# (self.kernel_size, self.kernel_size), sigma) +# return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 378a387b..aa8164a0 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -214,15 +214,21 @@ def worker_init_fn(worker_id): # build optimizer and lr scheduler if self.mode == 'train': + assert self.config.get("Optimizer", None) is not None, "Optimizer must be defined in config." if self.config["Optimizer"].get('decay_unit', None) is not None: self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] else: self.lr_decay_unit = 'step' Warning('lr_decay_unit is not set in optimizer config, set to step by default') - self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader)) + + config_lr_scheduler = self.config["Optimizer"].get('LRScheduler', None) + self.lr_scheduler = None + if config_lr_scheduler is not None: + self.lr_scheduler = build_lr_scheduler(config_lr_scheduler, self.config["Global"]["epochs"], len(self.train_dataloader), self.lr_decay_unit) + + self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader), self.lr_scheduler) # load pretrained model - if self.config["Global"]["pretrained_model"] is not None: assert isinstance( self.config["Global"]["pretrained_model"], str ), "pretrained_model type is not available. Please use `string`." 
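
The engine change above wires the Optimizer-level LRScheduler and the per-group `custom_cfg` schedulers through `build_optimizer`. Below is a minimal, self-contained sketch of that grouping logic for reference; it is not part of the patch. `collect_param_groups` and `ToyNet` are hypothetical names introduced only for illustration, and the scheduler values simply mirror the finetune config in this series (head at 5.0, PasslDefault at 0.02).

import paddle
from paddle.optimizer.lr import MultiStepDecay

def collect_param_groups(model, custom_cfg):
    # Split trainable parameters by substring match on each entry's name and
    # attach one scheduler object per group; parameters matching no entry fall
    # into the PasslDefault group, mirroring _collect_params above.
    groups = {item["name"]: [] for item in custom_cfg}
    groups.setdefault("PasslDefault", [])
    for name, param in model.named_parameters():
        if param.stop_gradient:
            continue
        for item in custom_cfg:
            if item["name"] in name and item["name"] != "PasslDefault":
                groups[item["name"]].append(param)
                break
        else:
            groups["PasslDefault"].append(param)
    res = []
    for item in custom_cfg:
        lr = MultiStepDecay(**item["LRScheduler"])  # one scheduler per group
        res.append({"params": groups[item["name"]], "lr": lr})
    return res

class ToyNet(paddle.nn.Layer):
    # Stand-in for the ResNet: a backbone plus a classification head.
    def __init__(self):
        super().__init__()
        self.backbone = paddle.nn.Linear(8, 8)
        self.head = paddle.nn.Linear(8, 2)
    def forward(self, x):
        return self.head(self.backbone(x))

custom_cfg = [
    {"name": "head",
     "LRScheduler": {"learning_rate": 5.0, "milestones": [12, 16], "gamma": 0.2}},
    {"name": "PasslDefault",
     "LRScheduler": {"learning_rate": 0.02, "milestones": [12, 16], "gamma": 0.2}},
]
for group in collect_param_groups(ToyNet(), custom_cfg):
    # Optimizer.lr_step(epoch) later advances each group's scheduler independently.
    print(len(group["params"]), "params, initial lr =", group["lr"].get_lr())
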
diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 6463357a..4c7349a2 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -182,7 +182,7 @@ def forward_backward(self, batch): out = paddle.concat(final_out, axis=0) return out, final_loss_dict - def train_one_step(self, batch): + def train_one_step(self, batch, total_iterations=None): # do forward and backward out, loss_dict = self.forward_backward(batch) diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index a772a28d..f6fe4fbe 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -69,19 +69,24 @@ def forward_backward(self, batch): return final_loss_dict - def train_one_step(self, batch): + def train_one_step(self, batch, total_iterations): # remove label batch = batch[0] # do forward and backward loss_dict = self.forward_backward(batch) + + try: + self.trainer.model.after_loss_backward(total_iterations) + except AttributeError: + logger.warning("Model has no after_loss_backward method, ignored this process") grad_sync(self.trainer.optimizer.param_groups) # do unscale and step if using fp16 and not found nan/inf # otherwise do nothing - self.trainer.scaler.step(self.trainer.optimizer) + self.trainer.scaler.step(self.trainer.optimizer) # todo # check this will updata weight, before this weight is not updated # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index 35bdfa1d..dbe60dd6 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -219,7 +219,7 @@ def run(self): self.trainer.train_dataloader.batch_sampler.set_epoch(epoch_id) # for one epoch train - self.train_one_epoch() + self.train_one_epoch(epoch_id) if self.trainer.lr_decay_unit == 'epoch': self.trainer.optimizer.lr_step(self.cur_epoch_id) @@ -257,13 +257,14 @@ def run(self): self.trainer.training = False - def train_one_epoch(self): + def train_one_epoch(self, epoch_id): self.trainer.model.train() tic = time.time() for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx + total_iterations = epoch_id*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( @@ -288,7 +289,7 @@ def train_one_epoch(self): self.global_step += 1 # do forward and backward - out, loss_dict = self.train_one_step(batch) + out, loss_dict = self.train_one_step(batch, total_iterations) self.time_info["batch_cost"].update(time.time() - tic) @@ -310,7 +311,7 @@ def train_one_epoch(self): tic = time.time() - def train_one_step(self, batch): + def train_one_step(self, batch, total_iterations): raise NotImplementedError def save_checkpoint(self): diff --git a/passl/models/swav.py b/passl/models/swav.py index 3a318ca7..502bff83 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,7 +1,8 @@ -from collections import defaultdict -import copy import os +import copy +import numpy as np from sys import flags +from collections import defaultdict import paddle import paddle.nn as nn @@ -139,9 +140,6 @@ def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length= self.custom_cfg = config.pop('custom_cfg', None) if self.custom_cfg is not None: assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
- for item in self.custom_cfg: - if item['name']=='PasslDefault': - assert item.get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' for item in self.custom_cfg: assert isinstance( @@ -158,12 +156,13 @@ def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length) # split params self.weight_decay = config['weight_decay'] - params_dict = {item['name']: [] for item in self.custom_cfg} + params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault + params_dict['PasslDefault'] = [] for name, param in model.named_parameters(): if param.stop_gradient: continue for idx, item in enumerate(self.custom_cfg): - if item['name'] in name and item['name']!='PasslDefault': + if item['name'] in name: params_dict[item['name']].append(param) break else: @@ -177,13 +176,14 @@ def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length) else: Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) # todo: initialize LRCallable here. - lr_scheduler = build_lr_scheduler(self.custom_cfg['PasslDefault']['LRScheduler'], epochs, trainset_length, config['decay_unit']) param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} if self.weight_decay is not None and weight_decay_mult is not None: param_dict['weight_decay'] = self.weight_decay * weight_decay_mult param_dict['tensor_fusion'] = tensor_fusion res.append(param_dict) + else: + res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) msg = 'Parameter groups for optimizer: \n' for idx, item in enumerate(self.custom_cfg): @@ -199,11 +199,85 @@ def forward(self, inp): return self.res_model(inp) class SwAVPretrain(SwAV): - def __init__(self, **kwargs): + def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], epsilon=0.05, freeze_prototypes_niters=5005, **kwargs): super().__init__(**kwargs) + self.crops_for_assign = crops_for_assign + self.nmb_crops = nmb_crops + self.temperature = 0.1 + self.epsilon = epsilon + self.freeze_prototypes_niters = freeze_prototypes_niters + + # initialize queue + self.queue = None + # queue_path = os.path.join('.', "queue" + str(0) + ".pth") + # if os.path.isfile(queue_path): + # self.queue = paddle.load(queue_path)["queue"] + # # the queue needs to be divisible by the batch size + # queue_length = queue_length + # queue_length -= queue_length % (256) + # if queue_length > 0 and epoch >= 15 and self.queue is None: + # self.queue = paddle.zeros([len(crops_for_assign), + # queue_length // 4, kwargs['output_dim']]) + @paddle.no_grad() + def distributed_sinkhorn(self, out, sinkhorn_iterations=3): + Q = paddle.exp(x=out / self.epsilon).t() + B = Q.shape[1] * 4 + K = Q.shape[0] + sum_Q = paddle.sum(x=Q) + paddle.distributed.all_reduce(sum_Q) + Q /= sum_Q + for it in range(sinkhorn_iterations): + sum_of_rows = paddle.sum(x=Q, axis=1, keepdim=True) + paddle.distributed.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + Q /= paddle.sum(x=Q, axis=0, keepdim=True) + Q /= B + Q *= B + return Q.t() + def forward(self, inp): - return self.res_model(inp) + bs = inp[0].shape[0] + + # normalize the prototypes + with paddle.no_grad(): + w = self.res_model.prototypes.weight.clone() + w = paddle.nn.functional.normalize(x=w, axis=1, p=2) + self.res_model.prototypes.weight.copy_(w) + embedding, output = self.res_model(inp) + embedding = embedding.detach() + + # compute loss + loss = 0 + for i, crop_id in 
enumerate(self.crops_for_assign): + with paddle.no_grad(): + out = output[bs * crop_id:bs * (crop_id + 1)].detach() + if self.queue is not None: + if use_the_queue or not paddle.all(x=self.queue[(i), (-1), :] == 0): + use_the_queue = True + out = paddle.concat(x=(paddle.mm(input=self.queue[i], + mat2=self.res_model.prototypes.weight.t()), out)) + self.queue[(i), bs:] = self.queue[(i), :-bs].clone() + self.queue[(i), :bs] = embedding[crop_id * bs:(crop_id + 1) * bs] + + q = self.distributed_sinkhorn(out)[-bs:] + subloss = 0 + for v in np.delete(np.arange(np.sum(self.nmb_crops)), crop_id): + x = output[bs * v:bs * (v + 1)] / self.temperature + subloss -= paddle.mean(x=paddle.sum(x=q * paddle.nn. + functional.log_softmax(x=x, axis=1), axis=1)) + loss += subloss / (np.sum(self.nmb_crops) - 1) + loss /= len(self.crops_for_assign) + + return + + def after_loss_backward(self, iteration): + if iteration < self.freeze_prototypes_niters: + for name, p in self.res_model.named_parameters(): + if 'prototypes' in name: + p.grad = None + def swav_resnet50_linearprobe(**kwargs): @@ -216,13 +290,19 @@ def swav_resnet50_finetune(**kwargs): model = nn.SyncBatchNorm.convert_sync_batchnorm(model) return model -def swav_resnet50_pretrain(**kwargs): # todo +def swav_resnet50_pretrain(apex, **kwargs): # todo flags = {} flags['FLAGS_cudnn_exhaustive_search'] = True flags['FLAGS_cudnn_deterministic'] = False paddle.set_flags(flags) if paddle.distributed.get_world_size() > 1: - model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + if not apex: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + else: + # with apex syncbn speeds up computation than global syncbn + process_group = apex.parallel.create_syncbn_process_group(8) + model = apex.parallel.convert_syncbn_model(model, process_group=process_group) + model = SwAVPretrain(**kwargs) return model diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 5a2add56..43216690 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -34,7 +34,7 @@ from .momentum_larc import MomentumLARC -def build_optimizer(optim_config, model, config, trainset_length): +def build_optimizer(optim_config, model, config, trainset_length, lr_scheduler): optim_config = copy.deepcopy(optim_config) optim_name = optim_config.pop('name') @@ -91,8 +91,17 @@ def build_optimizer(optim_config, model, config, trainset_length): param_group.append(group) + lr = lr_scheduler + lr_func = None + if isinstance(lr_scheduler, LRCallable): + lr = lr_scheduler.lr + lr_func = lr_scheduler + optim = eval(optim_name)(param_group, + lr=lr, + lr_func=lr_func, grad_clip=grad_clip, **optim_config) + logger.debug("build optimizer ({}) success..".format(optim)) return optim diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index 946001dd..974d84b1 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -42,6 +42,12 @@ Optimizer: weight_decay: 0.0 tensor_fusion: False decay_unit: epoch + LRScheduler: + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 custom_cfg: - name: head LRScheduler: @@ -50,13 +56,6 @@ Optimizer: milestones: [12, 16] gamma: 0.2 last_epoch: -1 - - name: PasslDefault - LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 # data loader for train 
and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 4780a9e1..c67ddd2a 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -43,14 +43,12 @@ Optimizer: weight_decay: 1e-6 tensor_fusion: True decay_unit: epoch - custom_cfg: - - name: PasslDefault - LRScheduler: - name: TimmCosine - learning_rate: 0.3 - eta_min: 0.0 - last_epoch: 0 - warmup_epoch: 0 + LRScheduler: + name: TimmCosine + learning_rate: 0.3 + eta_min: 0.0 + last_epoch: 0 + warmup_epoch: 0 # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index c514c6bc..935d4ec7 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -13,14 +13,14 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 800 + epochs: 400 # 800 print_batch_step: 100 use_visualdl: False seed: 31 # FP16 setting -FP16: - level: O1 +# FP16: +# level: O1 # GradScaler: # init_loss_scaling: 65536.0 # incr_every_n_steps: 2000 @@ -31,11 +31,33 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_pretrain + apex: False + queue_length: 3804 # 0 + crops_for_assign: [0, 1] + nmb_crops: [2, 6] + epsilon: 0.05 + freeze_prototypes_niters: 5005 # 313 normalize: True hidden_mlp: 2048 output_dim: 128 nmb_prototypes: 3000 +# Optimizer: +# name: MomentumLARC +# momentum: 0.9 +# weight_decay: 1e-6 +# trust_coefficient: 0.001 +# clip: False +# tensor_fusion: False +# decay_unit: step +# LRScheduler: +# name: TimmCosine +# learning_rate: 4.8 +# eta_min: 0.0048 +# warmup_epoch: 10 +# warmup_start_lr: 0.3 +# warmup_prefix: True + Optimizer: name: MomentumLARC momentum: 0.9 @@ -43,33 +65,31 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False - decay_unit: epoch - custom_cfg: - - name: PasslDefault - LRScheduler: - name: TimmCosine - learning_rate: 4.8 - decay_unit: step - eta_min: 0.0048 - warmup_epoch: 10 - warmup_start_lr: 0.3 - warmup_prefix: True + decay_unit: step + LRScheduler: + name: TimmCosine + learning_rate: 0.6 + eta_min: 0.0006 + warmup_epoch: 0 + warmup_start_lr: 0. 
+ warmup_prefix: True + last_epoch: 0 # data loader for train and eval DataLoader: Train: dataset: name: MultiCropDataset - root: ./dataset/ILSVRC2012/train + root: ./data/ILSVRC2012/train size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] max_scale_crops: [1, 0.14] sampler: name: DistributedBatchSampler - batch_size: 128 # accum_steps: 1, total batchsize: 4096 + batch_size: 64 # 4card # 128 32 card # accum_steps: 1, total batchsize: 4096 drop_last: False shuffle: True loader: - num_workers: 8 + num_workers: 10 use_shared_memory: True diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index d1c866c6..6972eddd 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -16,7 +16,9 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=4,5,6,7 +# export CUDA_VISIBLE_DEVICES=7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ From 1d69baa127a7dddf22498a2546d7ed7691ea53c6 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 4 May 2023 20:52:07 +0800 Subject: [PATCH 14/46] valid_pretrain --- passl/data/dataset/multicrop_dataset.py | 12 +- passl/data/preprocess/basic_transforms.py | 2 +- passl/engine/engine.py | 15 +-- .../engine/loops/contrastive_learning_loop.py | 116 ++++++++++++++++-- passl/engine/loops/loop.py | 2 +- passl/models/resnet.py | 10 +- passl/models/swav.py | 51 +++++--- passl/optimizer/optimizer.py | 2 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 95 ++++++++++++++ ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 43 ++----- tasks/ssl/swav/pretrain.sh | 7 +- 11 files changed, 271 insertions(+), 84 deletions(-) create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index ffa30008..42b800f7 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -36,14 +36,12 @@ def __init__(self, size_crops, num_crops, min_scale_crops, - max_scale_crops, - return_label=False): + max_scale_crops): super(MultiCropDataset, self).__init__(root) assert len(size_crops) == len(num_crops) assert len(min_scale_crops) == len(num_crops) assert len(max_scale_crops) == len(num_crops) - self.return_label = return_label color_transform = [get_color_distortion(), get_pil_gaussian_blur()] mean = [0.485, 0.456, 0.406] @@ -71,13 +69,11 @@ def __getitem__(self, index): Returns: tuple: (sample, target) where target is class_index of the target class. 
""" - path, target = self.samples[index] + path, target = self.imgs[index] sample = self.loader(path) sample = list(map(lambda trans: trans(sample), self.trans)) - if self.return_label: - return sample, target - - return sample + + return sample, target def get_pil_gaussian_blur(p=0.5): diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index 96a784c3..ace7c1d2 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -57,7 +57,7 @@ "SimCLRGaussianBlur", "BYOLSolarize", "MAERandCropImage", - "GaussianBlur" + # "GaussianBlur" ] diff --git a/passl/engine/engine.py b/passl/engine/engine.py index aa8164a0..c50b5084 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -229,13 +229,14 @@ def worker_init_fn(worker_id): self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader), self.lr_scheduler) # load pretrained model - assert isinstance( - self.config["Global"]["pretrained_model"], str - ), "pretrained_model type is not available. Please use `string`." - self.model.load_pretrained( - self.config["Global"]["pretrained_model"], - self.config["Global"]["rank"], - self.config["Global"].get("finetune", False)) + if self.config["Global"]["pretrained_model"] is not None: + assert isinstance( + self.config["Global"]["pretrained_model"], str + ), "pretrained_model type is not available. Please use `string`." + self.model.load_pretrained( + self.config["Global"]["pretrained_model"], + self.config["Global"]["rank"], + self.config["Global"].get("finetune", False)) # for distributed if self.config["Global"]["distributed"]: diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index f6fe4fbe..4ebf1346 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -16,7 +16,11 @@ from __future__ import division from __future__ import print_function +import os import sys +import logging +from datetime import timedelta + import time import collections import platform @@ -28,12 +32,87 @@ from passl.utils import logger from .loop import TrainingEpochLoop + +class LogFormatter: + def __init__(self): + self.start_time = time.time() + + def format(self, record): + elapsed_seconds = round(record.created - self.start_time) + + prefix = "%s - %s - %s" % ( + record.levelname, + time.strftime("%x %X"), + timedelta(seconds=elapsed_seconds), + ) + message = record.getMessage() + message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) + return "%s - %s" % (prefix, message) if message else "" + + +def create_logger(filepath, rank): + """ + Create a logger. + Use a different log file for each process. 
+ """ + # create log formatter + log_formatter = LogFormatter() + + # create file handler and set level to debug + if filepath is not None: + if rank > 0: + filepath = "%s-%i" % (filepath, rank) + file_handler = logging.FileHandler(filepath, "a") + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(log_formatter) + + # create console handler and set level to info + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(log_formatter) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + if filepath is not None: + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + # reset logger elapsed time + def reset_time(): + log_formatter.start_time = time.time() + + logger.reset_time = reset_time + + return logger + + +def init_logger(name): + logger = create_logger( + os.path.join("{}.log".format(name)), rank=0 + ) + logger.info("============ Initialized logger ============") + logger.info("") + return logger + + +def log_model(model, logger): + model1 = model.res_model + for name, param in model1.named_parameters(): + logger.info(name) + logger.info(param.abs().sum()) + if param.grad is not None: + logger.info(name+'grad') + logger.info(param.grad.abs().sum()) + class ContrastiveLearningTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): super().__init__(trainer, epochs, max_train_step=max_train_step, val_loop=val_loop) - def forward_backward(self, batch): + def forward_backward(self, batch, total_iterations): # Gradient Merge(GuoxiaWang): Accumulate gradient over multiple # steps to save on memory. @@ -57,6 +136,9 @@ def forward_backward(self, batch): if isinstance(loss_dict, paddle.Tensor): loss_dict = {'loss': loss_dict} + ####### test ####### + # logger1 = init_logger('before_pretrain') + # log_model(self.trainer.model, logger1) for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -66,30 +148,44 @@ def forward_backward(self, batch): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - + + try: + self.trainer.model.after_loss_backward(total_iterations) + except AttributeError: + logger.warning("Model has no after_loss_backward method, ignored this process") + + ####### test ####### +# grad_sync(self.trainer.optimizer.param_groups) + +# # do unscale and step if using fp16 and not found nan/inf +# # otherwise do nothing +# self.trainer.scaler.step(self.trainer.optimizer) +# # do update loss scaling if using fp16 +# # otherwise do nothing +# self.trainer.scaler.update() + +# logger2 = init_logger('after_pretrain') + # log_model(self.trainer.model, logger2) + # print('final_loss_dict', final_loss_dict) return final_loss_dict def train_one_step(self, batch, total_iterations): # remove label batch = batch[0] - - # do forward and backward - loss_dict = self.forward_backward(batch) - try: - self.trainer.model.after_loss_backward(total_iterations) - except AttributeError: - logger.warning("Model has no after_loss_backward method, ignored this process") + # do forward and backward + loss_dict = self.forward_backward(batch, total_iterations) grad_sync(self.trainer.optimizer.param_groups) # do unscale and step if using fp16 and not found nan/inf # otherwise do nothing - self.trainer.scaler.step(self.trainer.optimizer) # todo # check this will updata 
weight, before this weight is not updated + self.trainer.scaler.step(self.trainer.optimizer) # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() + # clear gradients self.trainer.optimizer.clear_grad() diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index dbe60dd6..959bb386 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -264,7 +264,7 @@ def train_one_epoch(self, epoch_id): for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - total_iterations = epoch_id*self.total_batch_idx + batch_idx + total_iterations = (epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 34761215..735d6485 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -109,7 +109,7 @@ def __init__(self, block, layers, zero_init_residual=False, groups=1, super(ResNet, self).__init__() if norm_layer is None: - norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=True) + norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=False) self._norm_layer = norm_layer self.eval_mode = eval_mode self.padding = paddle.nn.Pad2D(padding=1, value=0.0) @@ -198,7 +198,9 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False): def forward_backbone(self, x): x = self.padding(x) x = self.conv1(x) + # print("before bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) x = self.bn1(x) + # print("bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) @@ -213,7 +215,9 @@ def forward_backbone(self, x): def forward_head(self, x): if self.projection_head is not None: + # print("before proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) x = self.projection_head(x) + # print(" proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) if self.l2norm: x = paddle.nn.functional.normalize(x=x, axis=1, p=2) if self.prototypes is not None: @@ -229,8 +233,7 @@ def forward(self, inputs): return_counts=True)[1], axis=0) # padiff start_idx = 0 for end_idx in idx_crops: - _out = self.forward_backbone(paddle.concat(x=inputs[start_idx: - end_idx])) + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) if start_idx == 0: output = _out else: @@ -240,7 +243,6 @@ def forward(self, inputs): class MultiPrototypes(paddle.nn.Layer): - def __init__(self, output_dim, nmb_prototypes): super(MultiPrototypes, self).__init__() self.nmb_heads = len(nmb_prototypes) diff --git a/passl/models/swav.py b/passl/models/swav.py index 502bff83..311b7b19 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -84,7 +84,10 @@ def load_pretrained(self, path, rank=0, finetune=False): def save(self, path, local_rank=0, rank=0): paddle.save(self.state_dict(), path + ".pdparams") - + + def _freeze_norm(self, layer): + if isinstance(layer, (nn.layer.norm._BatchNormBase)): + layer._use_global_stats = True class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, **kwargs): @@ -103,10 +106,6 @@ def __init__(self, class_num=1000, **kwargs): assert len(parameters) == 2 # weight, bias self.apply(self._freeze_norm) - - def _freeze_norm(self, layer): - if isinstance(layer, (nn.layer.norm._BatchNormBase)): - layer._use_global_stats = True def load_pretrained(self, path, 
rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') @@ -127,11 +126,7 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') # self._load_model("projection_head.pdparams", self.res_model.projection_head, 'projection_head') - - def _freeze_norm(self, layer): - if isinstance(layer, (nn.layer.norm._BatchNormBase)): - layer._use_global_stats = True - + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] @@ -218,7 +213,12 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep # if queue_length > 0 and epoch >= 15 and self.queue is None: # self.queue = paddle.zeros([len(crops_for_assign), # queue_length // 4, kwargs['output_dim']]) + # self.load_pretrained('swav_800ep_pretrain.pdparams') + self.apply(self._freeze_norm) + def load_pretrained(self, path, rank=0, finetune=False): + self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') + @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): Q = paddle.exp(x=out / self.epsilon).t() @@ -238,14 +238,21 @@ def distributed_sinkhorn(self, out, sinkhorn_iterations=3): return Q.t() def forward(self, inp): + # ####### test ####### + # import numpy as np + # np.random.seed(42) + # a = np.random.rand(32, 3, 224, 224) + # inp = paddle.to_tensor(a).astype('float32') bs = inp[0].shape[0] # normalize the prototypes with paddle.no_grad(): w = self.res_model.prototypes.weight.clone() - w = paddle.nn.functional.normalize(x=w, axis=1, p=2) - self.res_model.prototypes.weight.copy_(w) + w = paddle.nn.functional.normalize(x=w, axis=0, p=2) # 1 + paddle.assign(w, self.res_model.prototypes.weight) embedding, output = self.res_model(inp) + # print('output, embedding', embedding.mean(), output.mean(), inp.mean()) + # import pdb; pdb.set_trace() embedding = embedding.detach() # compute loss @@ -253,6 +260,7 @@ def forward(self, inp): for i, crop_id in enumerate(self.crops_for_assign): with paddle.no_grad(): out = output[bs * crop_id:bs * (crop_id + 1)].detach() + # print('bs, crop_id', bs, crop_id, self.nmb_crops) if self.queue is not None: if use_the_queue or not paddle.all(x=self.queue[(i), (-1), :] == 0): use_the_queue = True @@ -262,23 +270,28 @@ def forward(self, inp): self.queue[(i), :bs] = embedding[crop_id * bs:(crop_id + 1) * bs] q = self.distributed_sinkhorn(out)[-bs:] + # print('out.mean(), q.mean()', out.mean(), q.mean()) + subloss = 0 + # print(output.shape) for v in np.delete(np.arange(np.sum(self.nmb_crops)), crop_id): x = output[bs * v:bs * (v + 1)] / self.temperature subloss -= paddle.mean(x=paddle.sum(x=q * paddle.nn. 
functional.log_softmax(x=x, axis=1), axis=1)) + # print('v, subloss', v, subloss) + loss += subloss / (np.sum(self.nmb_crops) - 1) + # print('i, loss', i, loss) + # import pdb; pdb.set_trace() loss /= len(self.crops_for_assign) - return + return loss def after_loss_backward(self, iteration): if iteration < self.freeze_prototypes_niters: for name, p in self.res_model.named_parameters(): - if 'prototypes' in name: - p.grad = None - - + if 'prototypes' in name and p.grad is not None: + p.clear_grad() def swav_resnet50_linearprobe(**kwargs): model = SwAVLinearProbe(**kwargs) @@ -295,6 +308,9 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo flags['FLAGS_cudnn_exhaustive_search'] = True flags['FLAGS_cudnn_deterministic'] = False paddle.set_flags(flags) + + model = SwAVPretrain(**kwargs) + if paddle.distributed.get_world_size() > 1: if not apex: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) @@ -303,7 +319,6 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo process_group = apex.parallel.create_syncbn_process_group(8) model = apex.parallel.convert_syncbn_model(model, process_group=process_group) - model = SwAVPretrain(**kwargs) return model class RegLog(paddle.nn.Layer): diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 94c4561f..234af8b9 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -214,7 +214,7 @@ def lr_step(self, step=None): elif 'lr_func' in group and callable(group['lr_func']): group['lr_func'](group, step) - print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) + # print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) @paddle.no_grad() def get_lr(self, group_id=0): diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..ba38de7e --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,95 @@ +# global configs +Global: + task_type: ContrastiveLearning + train_loop: ContrastiveLearningTrainingEpochLoop + validate_loop: None + checkpoint: null + pretrained_model: null + output_dir: ./output/pretrain_0504_fp16 + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: False + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 400 # 800 + print_batch_step: 100 + use_visualdl: False + seed: 31 + +# FP16 setting +FP16: + level: O1 +# GradScaler: +# init_loss_scaling: 65536.0 +# incr_every_n_steps: 2000 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_pretrain + apex: False + queue_length: 3804 # 0 + crops_for_assign: [0, 1] + nmb_crops: [2, 6] + epsilon: 0.05 + freeze_prototypes_niters: 5005 # 313 + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 + +# Optimizer: +# name: MomentumLARC +# momentum: 0.9 +# weight_decay: 1e-6 +# trust_coefficient: 0.001 +# clip: False +# tensor_fusion: False +# decay_unit: step +# LRScheduler: +# name: TimmCosine +# learning_rate: 4.8 +# eta_min: 0.0048 +# warmup_epoch: 10 +# warmup_start_lr: 0.3 +# warmup_prefix: True + +Optimizer: + name: MomentumLARC + momentum: 0.9 + weight_decay: 1e-6 + trust_coefficient: 0.001 + clip: False + tensor_fusion: False + decay_unit: step + LRScheduler: + name: TimmCosine + learning_rate: 0.6 + eta_min: 0.0006 + warmup_epoch: 0 + warmup_start_lr: 0. 
+ warmup_prefix: True + last_epoch: 0 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiCropDataset + root: ./data/ILSVRC2012 + size_crops: [224, 96] + num_crops: [2, 6] + min_scale_crops: [0.14, 0.05] + max_scale_crops: [1, 0.14] + sampler: + name: DistributedBatchSampler + batch_size: 64 # 4 card # 128 32 card # accum_steps: 1, total batchsize: 4096 + drop_last: True + shuffle: True + loader: + num_workers: 10 + use_shared_memory: True diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 935d4ec7..d0292c58 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -5,7 +5,7 @@ Global: validate_loop: None checkpoint: null pretrained_model: null - output_dir: ./output/pretrain_0420 + output_dir: ./output/pretrain_0504_fp16 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -13,14 +13,14 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 400 # 800 + epochs: 800 print_batch_step: 100 use_visualdl: False seed: 31 # FP16 setting -# FP16: -# level: O1 +FP16: + level: O1 # GradScaler: # init_loss_scaling: 65536.0 # incr_every_n_steps: 2000 @@ -32,32 +32,16 @@ DistributedStrategy: Model: name: swav_resnet50_pretrain apex: False - queue_length: 3804 # 0 + queue_length: 0 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 - freeze_prototypes_niters: 5005 # 313 + freeze_prototypes_niters: 313 normalize: True hidden_mlp: 2048 output_dim: 128 nmb_prototypes: 3000 -# Optimizer: -# name: MomentumLARC -# momentum: 0.9 -# weight_decay: 1e-6 -# trust_coefficient: 0.001 -# clip: False -# tensor_fusion: False -# decay_unit: step -# LRScheduler: -# name: TimmCosine -# learning_rate: 4.8 -# eta_min: 0.0048 -# warmup_epoch: 10 -# warmup_start_lr: 0.3 -# warmup_prefix: True - Optimizer: name: MomentumLARC momentum: 0.9 @@ -68,27 +52,26 @@ Optimizer: decay_unit: step LRScheduler: name: TimmCosine - learning_rate: 0.6 - eta_min: 0.0006 - warmup_epoch: 0 - warmup_start_lr: 0. 
+ learning_rate: 4.8 + eta_min: 0.0048 + warmup_epoch: 10 + warmup_start_lr: 0.3 warmup_prefix: True - last_epoch: 0 # data loader for train and eval DataLoader: Train: dataset: name: MultiCropDataset - root: ./data/ILSVRC2012/train + root: ./data/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] max_scale_crops: [1, 0.14] sampler: name: DistributedBatchSampler - batch_size: 64 # 4card # 128 32 card # accum_steps: 1, total batchsize: 4096 - drop_last: False + batch_size: 64 # 4 card # 128 32 card # accum_steps: 1, total batchsize: 4096 + drop_last: True shuffle: True loader: num_workers: 10 diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index 6972eddd..d30ff34b 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -16,12 +16,11 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export CUDA_VISIBLE_DEVICES=4,5,6,7 -# export CUDA_VISIBLE_DEVICES=7 +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml + # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file From 5922ab1f41d62dec533d8a1dc910ed6880a39745 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 4 May 2023 21:14:41 +0800 Subject: [PATCH 15/46] format --- passl/data/preprocess/basic_transforms.py | 19 --- passl/engine/loops/classification_loop.py | 117 +----------------- .../engine/loops/contrastive_learning_loop.py | 3 +- passl/models/swav.py | 29 +---- ..._resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml} | 0 ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 19 --- tasks/ssl/swav/finetune.sh | 2 +- 7 files changed, 6 insertions(+), 183 deletions(-) rename tasks/ssl/swav/configs/{swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml => swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml} (100%) diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index ace7c1d2..7be2b26a 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -57,7 +57,6 @@ "SimCLRGaussianBlur", "BYOLSolarize", "MAERandCropImage", - # "GaussianBlur" ] @@ -942,21 +941,3 @@ def __call__(self, img): else: img = ImageOps.solarize(img) return img - - -# class GaussianBlur(object): -# """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" -# def __init__(self, sigma=[.1, 2.], _PIL=False): -# self.sigma = sigma -# self.kernel_size = 23 -# self._PIL = _PIL - -# def __call__(self, x): -# sigma = np.random.uniform(self.sigma[0], self.sigma[1]) -# if self._PIL: -# x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) -# return x -# else: -# x = cv2.GaussianBlur(np.array(x), -# (self.kernel_size, self.kernel_size), sigma) -# return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 4c7349a2..08bdc1d0 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -32,85 +32,6 @@ from .loop import _Loop, TrainingEpochLoop -import os 
-import logging -import time -from datetime import timedelta -import pandas as pd - - -class LogFormatter: - def __init__(self): - self.start_time = time.time() - - def format(self, record): - elapsed_seconds = round(record.created - self.start_time) - - prefix = "%s - %s - %s" % ( - record.levelname, - time.strftime("%x %X"), - timedelta(seconds=elapsed_seconds), - ) - message = record.getMessage() - message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) - return "%s - %s" % (prefix, message) if message else "" - - -def create_logger(filepath, rank): - """ - Create a logger. - Use a different log file for each process. - """ - # create log formatter - log_formatter = LogFormatter() - - # create file handler and set level to debug - if filepath is not None: - if rank > 0: - filepath = "%s-%i" % (filepath, rank) - file_handler = logging.FileHandler(filepath, "a") - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(log_formatter) - - # create console handler and set level to info - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(log_formatter) - - # create logger and set level to debug - logger = logging.getLogger() - logger.handlers = [] - logger.setLevel(logging.DEBUG) - logger.propagate = False - if filepath is not None: - logger.addHandler(file_handler) - logger.addHandler(console_handler) - - # reset logger elapsed time - def reset_time(): - log_formatter.start_time = time.time() - - logger.reset_time = reset_time - - return logger - - -def init_logger(name): - logger = create_logger( - os.path.join("{}.log".format(name)), rank=0 - ) - logger.info("============ Initialized logger ============") - logger.info("") - return logger - - -def log_model(model, logger): - model1 = model.res_model - for name, param in model1.named_parameters(): - logger.info(name) - logger.info(param.abs().mean()) - - class ClassificationTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): @@ -131,13 +52,6 @@ def forward_backward(self, batch): data = batch[0][idx * step_size:(idx + 1) * step_size] label = batch[1][idx * step_size:(idx + 1) * step_size] - ####### test ####### - # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') - # import numpy as np - # np.random.seed(42) - # a = np.random.rand(32, 3, 224, 224) - # data = paddle.to_tensor(a).astype('float32') - # do cast if using fp16 otherwise do nothing with paddle.amp.auto_cast( enable=self.trainer.fp16, @@ -149,11 +63,6 @@ def forward_backward(self, batch): final_out.append(out) loss_dict = self.trainer.train_loss_func(out, label) - # import pdb; pdb.set_trace() - - ####### test ####### - # logger1 = init_logger('before') - # log_model(self.trainer.model, logger1) for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -164,20 +73,6 @@ def forward_backward(self, batch): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - - ####### test ####### -# grad_sync(self.trainer.optimizer.param_groups) - -# # do unscale and step if using fp16 and not found nan/inf -# # otherwise do nothing -# self.trainer.scaler.step(self.trainer.optimizer) -# # do update loss scaling if using fp16 -# # otherwise do nothing -# self.trainer.scaler.update() - - # logger2 = init_logger('after') - # 
log_model(self.trainer.model, logger2) - out = paddle.concat(final_out, axis=0) return out, final_loss_dict @@ -198,7 +93,7 @@ def train_one_step(self, batch, total_iterations=None): # clear gradients self.trainer.optimizer.clear_grad() - if self.trainer.lr_decay_unit == 'step': # default is step + if self.trainer.lr_decay_unit == 'step': self.trainer.optimizer.lr_step(self.global_step) return out, loss_dict @@ -281,19 +176,9 @@ def eval_one_dataset(self, eval_dataloader): custom_black_list=self.trainer.fp16_custom_black_list, level=self.trainer.fp16_level): - ####### test ####### - # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960, 133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') - # import numpy as np - # np.random.seed(42) - # a = np.random.rand(32, 3, 224, 224) - # data = paddle.to_tensor(a).astype('float32') - - # import pdb; pdb.set_trace() - # out = self.trainer.model(data) out = self.trainer.model(batch[0]) # calc loss if self.trainer.eval_loss_func is not None: - # loss_dict = self.trainer.eval_loss_func(out, target) loss_dict = self.trainer.eval_loss_func(out, batch[1]) for key in loss_dict: if key not in output_info: diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 4ebf1346..d943d4cc 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -181,14 +181,13 @@ def train_one_step(self, batch, total_iterations): # do unscale and step if using fp16 and not found nan/inf # otherwise do nothing - self.trainer.scaler.step(self.trainer.optimizer) + self.trainer.scaler.step(self.trainer.optimizer) # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - if self.trainer.lr_decay_unit == 'step': self.trainer.optimizer.lr_step(self.global_step) diff --git a/passl/models/swav.py b/passl/models/swav.py index 311b7b19..cf9500fb 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,5 +1,4 @@ import os -import copy import numpy as np from sys import flags from collections import defaultdict @@ -48,6 +47,9 @@ def _load_model(self, path, model, tag): .format(k, para_state_dict[k].shape, model_state_dict[k] .shape)) else: + # conpact FP16 saving pretrained weight + if model_state_dict[k].dtype != para_state_dict[k].dtype: + para_state_dict[k] = para_state_dict[k].astype(model_state_dict[k].dtype) model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 model.set_dict(model_state_dict) @@ -58,29 +60,6 @@ def _load_model(self, path, model, tag): def load_pretrained(self, path, rank=0, finetune=False): pass -# if not os.path.exists(path + '.pdparams'): -# raise ValueError("Model pretrain path {} does not " -# "exists.".format(path)) - -# state_dict = self.state_dict() -# param_state_dict = paddle.load(path + ".pdparams") - -# # for FP16 saving pretrained weight -# for key, value in param_state_dict.items(): -# if key in param_state_dict and key in state_dict and param_state_dict[ -# key].dtype != state_dict[key].dtype: -# param_state_dict[key] = param_state_dict[key].astype( -# state_dict[key].dtype) - -# if not finetune: -# self.set_dict(param_state_dict) -# else: # load model when finetune -# for k in 
['head0.weight', 'head0.bias', 'head.weight', 'head.bias']: -# if k in param_state_dict: -# logger.info(f"Removing key {k} from pretrained checkpoint") -# del param_state_dict[k] - -# self.set_dict(param_state_dict) def save(self, path, local_rank=0, rank=0): paddle.save(self.state_dict(), path + ".pdparams") @@ -109,7 +88,6 @@ def __init__(self, class_num=1000, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - # self._load_model("linear.pdparams", self.linear, 'linear') def forward(self, inp): with paddle.no_grad(): @@ -125,7 +103,6 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - # self._load_model("projection_head.pdparams", self.res_model.projection_head, 'projection_head') def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml similarity index 100% rename from tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml rename to tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index ba38de7e..7cca7774 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -21,9 +21,6 @@ Global: # FP16 setting FP16: level: O1 -# GradScaler: -# init_loss_scaling: 65536.0 -# incr_every_n_steps: 2000 DistributedStrategy: data_parallel: True @@ -42,22 +39,6 @@ Model: output_dim: 128 nmb_prototypes: 3000 -# Optimizer: -# name: MomentumLARC -# momentum: 0.9 -# weight_decay: 1e-6 -# trust_coefficient: 0.001 -# clip: False -# tensor_fusion: False -# decay_unit: step -# LRScheduler: -# name: TimmCosine -# learning_rate: 4.8 -# eta_min: 0.0048 -# warmup_epoch: 10 -# warmup_start_lr: 0.3 -# warmup_prefix: True - Optimizer: name: MomentumLARC momentum: 0.9 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index c577ddb1..494e2002 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -23,5 +23,5 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml + passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml \ No newline at end of file From 8270a3cecc6c18b8b3f9e1fa138afe8bf81f1097 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 11:02:26 +0800 Subject: [PATCH 16/46] fix_AttrDict_error --- passl/utils/io.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/passl/utils/io.py b/passl/utils/io.py index deec5fef..8904215c 100644 --- a/passl/utils/io.py +++ b/passl/utils/io.py @@ -157,6 +157,12 @@ def save_checkpoint(net, if local_rank == 0: if loss_scaler is not None: opt_state_dict['scaler_state'] = loss_scaler.state_dict() + + # Solve AttrDict can't pickle error + for group in opt_state_dict['param_groups']: + if 'LRScheduler' in group: + group['LRScheduler'] = dict(group['LRScheduler']) + for model_prefix in model_prefixs: paddle.save(opt_state_dict, model_prefix + ".pdopt") paddle.save(metric_info, model_prefix + ".pdstates") 
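Note on the io.py hunk above: each optimizer param group can carry an AttrDict under its 'LRScheduler' key, and paddle.save cannot pickle it, so the entry is downcast to a plain dict before the state is saved. The same idea as a standalone helper (the helper name is hypothetical; the loop mirrors the added lines):

    # Hypothetical helper equivalent to the in-place loop added in save_checkpoint.
    def make_opt_state_picklable(opt_state_dict):
        for group in opt_state_dict.get('param_groups', []):
            if 'LRScheduler' in group:
                # AttrDict is a dict subclass used for configs; a plain dict pickles fine
                group['LRScheduler'] = dict(group['LRScheduler'])
        return opt_state_dict

    # usage inside save_checkpoint, just before paddle.save(opt_state_dict, ...):
    # opt_state_dict = make_opt_state_picklable(opt_state_dict)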
From 4e42f8e6277d2161a04458e4b5f47fb7b9a977d8 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 14:41:27 +0800 Subject: [PATCH 17/46] replace_swav_resnet --- passl/models/__init__.py | 3 +- passl/models/resnet.py | 458 ++++++++---------- passl/models/swav.py | 6 +- passl/models/swav_resnet.py | 111 +++++ passl/optimizer/momentum_larc.py | 2 - ...v_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml | 2 +- tasks/ssl/swav/finetune.sh | 2 +- 7 files changed, 317 insertions(+), 267 deletions(-) create mode 100644 passl/models/swav_resnet.py diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 38ea440d..85f9663b 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,8 @@ from .convnext import * from .mocov3 import * from .swav import * -# from .simsiam import * +from .swav_resnet import * +from .simsiam import * __all__ = ["build_model"] diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 735d6485..f15f3443 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -1,274 +1,214 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os import paddle -import functools -import paddle.nn as nn +from paddle.vision.models.resnet import ResNet as PDResNet +from paddle.vision.models.resnet import BottleneckBlock, BasicBlock from passl.models.base_model import Model -# from base_model import Model - -def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): - """3x3 convolution with padding""" - return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, - kernel_size=3, stride=stride, padding=dilation, groups=groups, - dilation=dilation, bias_attr=False, ) - - -def conv1x1(in_planes, out_planes, stride=1): - """1x1 convolution""" - return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, - kernel_size=1, stride=stride, bias_attr=False) - - -class BasicBlock(nn.Layer): - expansion = 1 - __constants__ = ['downsample'] - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups= - 1, base_width=64, dilation=1, norm_layer=None): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = paddle.nn.BatchNorm2D - if groups != 1 or base_width != 64: - raise ValueError( - 'BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError( - 'Dilation > 1 not supported in BasicBlock') - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = paddle.nn.ReLU() - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample is not None: - identity = self.downsample(x) - out += identity - out = self.relu(out) - return out - - -class Bottleneck(paddle.nn.Layer): - expansion = 4 - __constants__ = ['downsample'] - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups= - 1, base_width=64, dilation=1, norm_layer=None): - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = paddle.nn.BatchNorm2D - width = int(planes * (base_width / 64.0)) * groups - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = paddle.nn.ReLU() - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - out = self.conv3(out) - out = self.bn3(out) - if self.downsample is not None: - identity = self.downsample(x) - out += identity - out = self.relu(out) - return out - -def kaiming_normal_init(param, **kwargs): - initializer = nn.initializer.KaimingNormal(**kwargs) - initializer(param, param.block) - -def constant_init(param, **kwargs): - initializer = nn.initializer.Constant(**kwargs) - initializer(param, param.block) - - -class ResNet(paddle.nn.Layer): - def __init__(self, block, layers, zero_init_residual=False, groups=1, - widen=1, width_per_group=64, replace_stride_with_dilation=None, - norm_layer=None, normalize=False, output_dim=0, hidden_mlp=0, - nmb_prototypes=0, eval_mode=False): - - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=False) - self._norm_layer = norm_layer - 
self.eval_mode = eval_mode - self.padding = paddle.nn.Pad2D(padding=1, value=0.0) - self.inplanes = width_per_group * widen - self.dilation = 1 - if replace_stride_with_dilation is None: - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError( - 'replace_stride_with_dilation should be None or a 3-element tuple, got {}' - .format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - num_out_filters = width_per_group * widen - self.conv1 = paddle.nn.Conv2D(in_channels=3, out_channels= - num_out_filters, kernel_size=7, stride=2, padding=2, bias_attr= - False) - self.bn1 = norm_layer(num_out_filters) - self.relu = paddle.nn.ReLU() - self.maxpool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, num_out_filters, layers[0]) - num_out_filters *= 2 - self.layer2 = self._make_layer(block, num_out_filters, layers[1], - stride=2, dilate=replace_stride_with_dilation[0]) - num_out_filters *= 2 - self.layer3 = self._make_layer(block, num_out_filters, layers[2], - stride=2, dilate=replace_stride_with_dilation[1]) - num_out_filters *= 2 - self.layer4 = self._make_layer(block, num_out_filters, layers[3], - stride=2, dilate=replace_stride_with_dilation[2]) - self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - self.l2norm = normalize - if output_dim == 0: - self.projection_head = None - elif hidden_mlp == 0: - self.projection_head = paddle.nn.Linear(in_features= - num_out_filters * block.expansion, out_features=output_dim) - else: - self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( - in_features=num_out_filters * block.expansion, out_features - =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, - momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, - bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), - paddle.nn.Linear(in_features=hidden_mlp, out_features= - output_dim)) - self.prototypes = None - if isinstance(nmb_prototypes, list): - self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) - elif nmb_prototypes > 0: - self.prototypes = paddle.nn.Linear(in_features=output_dim, - out_features=nmb_prototypes, bias_attr=False) - for sublayer in self.sublayers(): - if isinstance(sublayer, nn.Conv2D): - kaiming_normal_init(sublayer.weight) # todo mode='fan_out', - elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): - constant_init(sublayer.weight, value=1.0) - constant_init(sublayer.bias, value=0.0) - +from passl.nn import init + +__all__ = [ + "ResNet", + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", + "resnext50_32x4d", + "resnext50_64x4d", + "resnext101_32x4d", + "resnext101_64x4d", + "resnext152_32x4d", + "resnext152_64x4d", + "wide_resnet50_2", + "wide_resnet101_2", +] + +class ResNet(PDResNet, Model): + def __init__( + self, + block, + depth=50, + width=64, + class_num=1000, + with_pool=True, + groups=1, + zero_init_residual=True, + ): + super().__init__(block, depth=depth, width=width, num_classes=class_num, with_pool=with_pool, groups=groups) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 if zero_init_residual: - for sublayer in self.sublayers(): - if isinstance(m, Bottleneck): - param_init.constant_init(sublayer.bn3.weight, value=0.0) + for m in self.sublayers(): + if isinstance(m, BottleneckBlock): + init.constant_(m.bn3.weight, 0) elif isinstance(m, BasicBlock): - param_init.constant_init(sublayer.bn2.weight, value=0.0) - - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = paddle.nn.Sequential(conv1x1(self.inplanes, planes * - block.expansion, stride), norm_layer(planes * block.expansion)) - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample, self - .groups, self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - return paddle.nn.Sequential(*layers) - - def forward_backbone(self, x): - x = self.padding(x) - x = self.conv1(x) - # print("before bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) - x = self.bn1(x) - # print("bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) - x = self.relu(x) - x = self.maxpool(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - if self.eval_mode: - return x - x = self.avgpool(x) - x = paddle.flatten(x=x, start_axis=1) - return x - - def forward_head(self, x): - if self.projection_head is not None: - # print("before proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) - x = self.projection_head(x) - # print(" proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) - if self.l2norm: - x = paddle.nn.functional.normalize(x=x, axis=1, p=2) - if self.prototypes is not None: - return x, self.prototypes(x) - return x - - def forward(self, inputs): - if not isinstance(inputs, list): - inputs = [inputs] - - idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
- to_tensor(data=[inp.shape[-1] for inp in inputs]), - return_counts=True)[1], axis=0) # padiff - start_idx = 0 - for end_idx in idx_crops: - _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) - if start_idx == 0: - output = _out - else: - output = paddle.concat(x=(output, _out)) - start_idx = end_idx - return self.forward_head(output) - - -class MultiPrototypes(paddle.nn.Layer): - def __init__(self, output_dim, nmb_prototypes): - super(MultiPrototypes, self).__init__() - self.nmb_heads = len(nmb_prototypes) - for i, k in enumerate(nmb_prototypes): - self.add_module('prototypes' + str(i), paddle.nn.Linear( - in_features=output_dim, out_features=k, bias_attr=False)) - - def forward(self, x): - out = [] - for i in range(self.nmb_heads): - out.append(getattr(self, 'prototypes' + str(i))(x)) - return out + init.constant_(m.bn2.weight, 0) + def load_pretrained(self, path, rank=0, finetune=False): + if not os.path.exists(path + '.pdparams'): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) -def resnet50(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + state_dict = self.state_dict() + param_state_dict = paddle.load(path + ".pdparams") + # for FP16 saving pretrained weight + for key, value in param_state_dict.items(): + if key in param_state_dict and key in state_dict and param_state_dict[ + key].dtype != state_dict[key].dtype: + param_state_dict[key] = param_state_dict[key].astype( + state_dict[key].dtype) -def resnet50w2(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], widen=2, **kwargs) + self.set_dict(param_state_dict) + def save(self, path, local_rank=0, rank=0): + paddle.save(self.state_dict(), path + ".pdparams") -def resnet50w4(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], widen=4, **kwargs) +def resnet18(**kwargs): + """ResNet 18-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + model = ResNet(BasicBlock, 18, **kwargs) + return model -def resnet50w5(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], widen=5, **kwargs) +def resnet34(**kwargs): + """ResNet 34-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + model = ResNet(BasicBlock, 34, **kwargs) + return model + +def resnet50(**kwargs): + """ResNet 50-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + + +def resnet101(**kwargs): + """ResNet 101-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + + model = ResNet(BottleneckBlock, 101, **kwargs) + return model + +def resnet152(**kwargs): + """ResNet 152-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + + model = ResNet(BottleneckBlock, 152, **kwargs) + return model + + +def resnext50_32x4d(**kwargs): + """ResNeXt-50 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 32 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + +def resnext50_64x4d(**kwargs): + """ResNeXt-50 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 64 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + +def resnext101_32x4d(**kwargs): + """ResNeXt-101 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. 
+ """ + + kwargs['groups'] = 32 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 101, **kwargs) + return model + +def resnext101_64x4d(**kwargs): + """ResNeXt-101 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 64 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 101, **kwargs) + return model + + +def resnext152_32x4d(**kwargs): + """ResNeXt-152 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 32 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 152, **kwargs) + return model + +def resnext152_64x4d(**kwargs): + """ResNeXt-152 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 64 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 152, **kwargs) + return model + +def wide_resnet50_2(**kwargs): + """Wide ResNet-50-2 model from + `"Wide Residual Networks" `_. + """ + + kwargs['width'] = 64 * 2 + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + +def wide_resnet101_2(**kwargs): + """Wide ResNet-101-2 model from + `"Wide Residual Networks" `_. + """ + + kwargs['width'] = 64 * 2 + model = ResNet(BottleneckBlock, 101, **kwargs) + return model diff --git a/passl/models/swav.py b/passl/models/swav.py index cf9500fb..0b0b554c 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -7,9 +7,9 @@ import paddle.nn as nn from passl.nn import init -from passl.scheduler import build_lr_scheduler, lr_scheduler +from passl.scheduler import build_lr_scheduler from passl.utils import logger -from passl.models.resnet import resnet50 +from passl.models.swav_resnet import swavresnet50 from passl.models.base_model import Model @@ -27,7 +27,7 @@ class SwAV(Model): def __init__(self, **kwargs): super().__init__() - self.res_model = resnet50(**kwargs) + self.res_model = swavresnet50(**kwargs) def _load_model(self, path, model, tag): if os.path.isfile(path): diff --git a/passl/models/swav_resnet.py b/passl/models/swav_resnet.py new file mode 100644 index 00000000..2869eedc --- /dev/null +++ b/passl/models/swav_resnet.py @@ -0,0 +1,111 @@ +import paddle +import functools +import paddle.nn as nn + +from .resnet import ResNet, BottleneckBlock + +def kaiming_normal_init(param, **kwargs): + initializer = nn.initializer.KaimingNormal(**kwargs) + initializer(param, param.block) + +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + + +class SwAVResNet(paddle.nn.Layer): + def __init__(self, block, depth, + normalize=False, output_dim=0, hidden_mlp=0, + nmb_prototypes=0, eval_mode=False): + + super(SwAVResNet, self).__init__() + self.l2norm = normalize + self.eval_mode = eval_mode + num_out_filters = 512 + + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + if output_dim == 0: + self.projection_head = None + elif hidden_mlp == 0: + self.projection_head = paddle.nn.Linear(in_features= + num_out_filters * block.expansion, out_features=output_dim) + else: + self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( + in_features=num_out_filters * block.expansion, out_features + =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, + momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, + bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), + paddle.nn.Linear(in_features=hidden_mlp, out_features= + output_dim)) + + self.prototypes = None + if isinstance(nmb_prototypes, list): + self.prototypes 
= MultiPrototypes(output_dim, nmb_prototypes) + elif nmb_prototypes > 0: + self.prototypes = paddle.nn.Linear(in_features=output_dim, + out_features=nmb_prototypes, bias_attr=False) + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) # todo mode='fan_out', + elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): + constant_init(sublayer.weight, value=1.0) + constant_init(sublayer.bias, value=0.0) + + self.encoder = functools.partial(ResNet, block=block, depth=depth)(with_pool=False, class_num=0) + + def forward_backbone(self, x): + x = self.encoder(x) + + if self.eval_mode: + return x + + x = self.avgpool(x) + x = paddle.flatten(x=x, start_axis=1) + return x + + def forward_head(self, x): + if self.projection_head is not None: + x = self.projection_head(x) + if self.l2norm: + x = paddle.nn.functional.normalize(x=x, axis=1, p=2) + if self.prototypes is not None: + return x, self.prototypes(x) + return x + + def forward(self, inputs): + if not isinstance(inputs, list): + inputs = [inputs] + + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. + to_tensor(data=[inp.shape[-1] for inp in inputs]), + return_counts=True)[1], axis=0) # padiff + start_idx = 0 + for end_idx in idx_crops: + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) + if start_idx == 0: + output = _out + else: + output = paddle.concat(x=(output, _out)) + start_idx = end_idx + return self.forward_head(output) + + +class MultiPrototypes(paddle.nn.Layer): + def __init__(self, output_dim, nmb_prototypes): + super(MultiPrototypes, self).__init__() + self.nmb_heads = len(nmb_prototypes) + for i, k in enumerate(nmb_prototypes): + self.add_module('prototypes' + str(i), paddle.nn.Linear( + in_features=output_dim, out_features=k, bias_attr=False)) + + def forward(self, x): + out = [] + for i in range(self.nmb_heads): + out.append(getattr(self, 'prototypes' + str(i))(x)) + return out + + +def swavresnet50(**kwargs): + return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) + diff --git a/passl/optimizer/momentum_larc.py b/passl/optimizer/momentum_larc.py index 80427b9d..09982f78 100644 --- a/passl/optimizer/momentum_larc.py +++ b/passl/optimizer/momentum_larc.py @@ -16,10 +16,8 @@ from __future__ import division from __future__ import print_function -import math import paddle from .optimizer import Optimizer -from passl.utils import logger class MomentumLARC(Optimizer): diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml index 974d84b1..a01f5680 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: swav_800ep_pretrain_adjustresnet.pdparams finetune: True output_dir: ./output/semi_0426_semi10 device: gpu diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 494e2002..af548129 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -23,5 +23,5 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml 
+ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml \ No newline at end of file From 21639c2e5df86e7a2077ac1a5fcc1e9f2b953b24 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 15:41:32 +0800 Subject: [PATCH 18/46] add_ci_readme --- passl/models/swav.py | 1 + tasks/ssl/swav/README.md | 50 ++++++++----------- ...av_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml} | 2 +- ...wav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml} | 2 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 2 +- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 4 +- tasks/ssl/swav/finetune.sh | 2 +- tasks/ssl/swav/linearprobe.sh | 2 +- tests/CI/case.sh | 49 ++++++++++++++++++ .../swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh | 30 +++++++++++ .../swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh | 30 +++++++++++ ...wav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh | 29 +++++++++++ 12 files changed, 168 insertions(+), 35 deletions(-) rename tasks/ssl/swav/configs/{swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml => swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml} (97%) rename tasks/ssl/swav/configs/{swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml} (95%) create mode 100644 tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh create mode 100644 tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh create mode 100644 tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh diff --git a/passl/models/swav.py b/passl/models/swav.py index 0b0b554c..22905d9c 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -30,6 +30,7 @@ def __init__(self, **kwargs): self.res_model = swavresnet50(**kwargs) def _load_model(self, path, model, tag): + path = path + ".pdparams" if os.path.isfile(path): para_state_dict = paddle.load(path) diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index b3f14b0e..d14c1b81 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -1,10 +1,10 @@ -## MoCo v3 for Self-supervised ResNet and ViT +## SwAV: Unsupervised Learning of Visual Features by Contrasting Cluster Assignments -PaddlePaddle reimplementation of [facebookresearch's repository for the MoCo v3 model](https://github.com/facebookresearch/moco-v3) that was released with the paper [An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/abs/2104.02057). +PaddlePaddle reimplementation of [facebookresearch's repository for the SwAV model](https://github.com/facebookresearch/swav) that was released with the paper [Unsupervised Learning of Visual Features by Contrasting Cluster Assignments](https://arxiv.org/abs/2006.09882). ## Requirements -To enjoy some new features, PaddlePaddle 2.4 is required. For more installation tutorials +To enjoy some new features, PaddlePaddle develop is required. 
For more installation tutorials refer to [installation.md](../../../tutorials/get_started/installation.md) ## Data Preparation @@ -20,7 +20,7 @@ dataset/ ## How to Self-supervised Pre-Training -With a batch size of 4096, ViT-Base is trained with 4 nodes: +With a batch size of 4096, SwAV is trained with 4 nodes: ```bash # Note: Set the following environment variables @@ -36,12 +36,12 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml + -c ./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml ``` ## How to Linear Classification -By default, we use momentum-SGD and a batch size of 1024 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. +By default, we use momentum-SGD and a batch size of 256 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. ```bash unset PADDLE_TRAINER_ENDPOINTS @@ -55,25 +55,17 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml + -c ./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml ``` ## How to End-to-End Fine-tuning -To perform end-to-end fine-tuning for ViT, use our script to convert the pre-trained ViT checkpoint to PASSL DeiT format: - -```bash -python extract_weight.py \ - --input pretrained/checkpoint_0299.pd \ - --output pretrained/moco_vit_base.pdparams -``` - -Then run the training with the converted PASSL format checkpoint: +To perform end-to-end fine-tuning for SwAV, run the training with the trained PASSL format checkpoint: ```bash unset PADDLE_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=0,1,2,3 export FLAGS_stop_check_timeout=3600 python -m paddle.distributed.launch \ @@ -81,28 +73,30 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml + -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml ``` ## Other Configurations -We provide more directly runnable configurations, see [MoCoV3 Configurations](./configs/). +We provide more directly runnable configurations, see [SwAV Configurations](./configs/). 
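For a quick offline sanity check of the model definition itself, the SwAV ResNet-50 wrapper added in `passl/models/swav_resnet.py` can be instantiated directly. The snippet below is only a minimal sketch: the projection and prototype sizes are illustrative values, and real training runs take their settings from the YAML configs above.

```python
import paddle
from passl.models.swav_resnet import swavresnet50

# Illustrative hyper-parameters; training uses the values from the YAML configs.
model = swavresnet50(normalize=True, output_dim=128, hidden_mlp=2048, nmb_prototypes=3000)
model.eval()

images = paddle.randn([2, 3, 224, 224])
# Returns L2-normalized projections and the prototype scores consumed by the SwAV loss.
embeddings, scores = model(images)
print(embeddings.shape, scores.shape)  # [2, 128] [2, 3000]
```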
## Models

### ResNet-50

-| Model         | Phase       | Dataset      | Configs | GPUs | Epochs | Top1 Acc | Checkpoint |
+| Model    | Phase        | Dataset      | Configs | GPUs | Epochs | Top1 Acc (%) | Links |
| ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ |
-| moco_vit_base | pretrain    | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 300 | - | [download](https://plsc.bj.bcebos.com/models/mocov3/v2.4/moco_vit_base_in1k_300ep.pd) |
-| moco_vit_base | linear prob | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 90 | 0.7662 | |
-| moco_vit_base | finetune    | ImageNet2012 | [config](./configs/DeiT_base_patch16_224_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 150 | 0.8288 | |
+| resnet50 | pretrain     | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model]() \| [log]() |
+| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml) | A100*N1C8 | - | 75.3 | [model]() \| [log]() |
+| resnet50 | finetune     | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 100 | 69.0 | [model]() \| [log]() |

## Citations

```bibtex
-@Article{chen2021mocov3,
-  author  = {Xinlei Chen* and Saining Xie* and Kaiming He},
-  title   = {An Empirical Study of Training Self-Supervised Vision Transformers},
-  journal = {arXiv preprint arXiv:2104.02057},
-  year    = {2021},
+@misc{caron2021unsupervised,
+  title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments},
+  author={Mathilde Caron and Ishan Misra and Julien Mairal and Priya Goyal and Piotr Bojanowski and Armand Joulin},
+  year={2021},
+  eprint={2006.09882},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
}
```
diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml
similarity index 97%
rename from tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml
rename to tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml
index a01f5680..8f641d0f 100644
--- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml
+++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml
@@ -4,7 +4,7 @@ Global:
   train_loop: ClassificationTrainingEpochLoop
   validate_loop: ClassificationEvaluationLoop
   checkpoint: null
-  pretrained_model: swav_800ep_pretrain_adjustresnet.pdparams
+  pretrained_model: swav_800ep_pretrain_adjustresnet
   finetune: True
   output_dir: ./output/semi_0426_semi10
   device: gpu
diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml
similarity index 95%
rename from tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml
rename to tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml
index c67ddd2a..5a4e9a83 100644
--- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml
+++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml
@@ -4,7 +4,7 @@ Global:
   train_loop: ClassificationTrainingEpochLoop
   validate_loop: ClassificationEvaluationLoop
   checkpoint: null
-  pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams
+  pretrained_model:
./pretrained/swav/swav_resnet50_in1k_800ep_pretrained output_dir: ./output device: gpu save_interval: 1 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 7cca7774..9eaf4fa1 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -5,7 +5,7 @@ Global: validate_loop: None checkpoint: null pretrained_model: null - output_dir: ./output/pretrain_0504_fp16 + output_dir: ./output/ device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index d0292c58..5ccbc6ad 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -5,7 +5,7 @@ Global: validate_loop: None checkpoint: null pretrained_model: null - output_dir: ./output/pretrain_0504_fp16 + output_dir: ./output/ device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -70,7 +70,7 @@ DataLoader: max_scale_crops: [1, 0.14] sampler: name: DistributedBatchSampler - batch_size: 64 # 4 card # 128 32 card # accum_steps: 1, total batchsize: 4096 + batch_size: 128 # 64 8 card # 128 32 card # accum_steps: 1, total batchsize: 4096 drop_last: True shuffle: True loader: diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index af548129..9844d806 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -23,5 +23,5 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \ No newline at end of file diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 4c37392b..07ced970 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -22,4 +22,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 4d863ed7..60814b7c 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -38,6 +38,9 @@ function model_list(){ mocov3_vit_base_patch16_224_pt_in1k_1n8c_dp_fp16o1 mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1 mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1 + swav_resnet50_224_ft_in1k_1n4c_dp_fp32 + swav_resnet50_224_lp_in1k_1n8c_dp_fp32 + swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 } ############ case start ############ @@ -354,6 +357,52 @@ function mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1() { } +function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep 
'200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=2.23445 + ips_base=793.89847 + mem_base=5.67 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + +function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=4.89133 + ips_base=11111.52955 + mem_base=0.83 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + + +function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh + + loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=8.00343 + ips_base=1385.94186 + mem_base=8.63 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + function check_result() { if [ $? -ne 0 ];then echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh new file mode 100644 index 00000000..187b8e8b --- /dev/null +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
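+
+# CI smoke test invoked from tests/CI/case.sh: it runs only 201 steps with deterministic
+# cuDNN settings and per-step logging, so that the loss / ips / max-memory values printed
+# around step 200 can be compared against the baselines recorded in case.sh.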
+
+export PADDLE_NNODES=1
+export PADDLE_MASTER="127.0.0.0:12538"
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \
+    -o Global.print_batch_step=1 \
+    -o Global.max_train_step=201 \
+    -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
+    -o Global.flags.FLAGS_cudnn_deterministic=1 \
+    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained
\ No newline at end of file
diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh
new file mode 100644
index 00000000..7c748f15
--- /dev/null
+++ b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh
@@ -0,0 +1,30 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export PADDLE_NNODES=1
+export PADDLE_MASTER="127.0.0.0:12538"
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml \
+    -o Global.print_batch_step=1 \
+    -o Global.max_train_step=201 \
+    -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
+    -o Global.flags.FLAGS_cudnn_deterministic=1 \
+    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained
\ No newline at end of file
diff --git a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 00000000..954705ad
--- /dev/null
+++ b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,29 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.0:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml \ + -o Global.print_batch_step=1 \ + -o Global.max_train_step=201 \ + -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ + -o Global.flags.FLAGS_cudnn_deterministic=1 \ No newline at end of file From 6814c125ec8b28b2bdaf95b232d522cf6be5ebbf Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 17:02:24 +0800 Subject: [PATCH 19/46] compact_lr_group --- passl/engine/engine.py | 32 +-- passl/models/swav.py | 115 +++++---- passl/optimizer/__init__.py | 227 +++++++++++++----- passl/optimizer/momentum.py | 6 +- passl/optimizer/optimizer.py | 19 +- passl/optimizer/utils/__init__.py | 1 + passl/optimizer/utils/group_params.py | 194 +++++++++++++++ passl/scheduler/__init__.py | 8 +- passl/scheduler/lr_callable.py | 16 +- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 14 +- ...swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 4 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 4 +- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 4 +- tasks/ssl/swav/finetune.sh | 2 +- 14 files changed, 473 insertions(+), 173 deletions(-) create mode 100644 passl/optimizer/utils/__init__.py create mode 100644 passl/optimizer/utils/group_params.py diff --git a/passl/engine/engine.py b/passl/engine/engine.py index c50b5084..7cacb83f 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -213,20 +213,24 @@ def worker_init_fn(worker_id): paddle.set_default_dtype(default_dtype) # build optimizer and lr scheduler + assert self.config.get("Optimizer", None) is not None, "Optimizer must be defined in config." + self.lr_decay_unit = self.config["Optimizer"].pop('lr_decay_unit', None) + if self.lr_decay_unit is None: + self.lr_decay_unit = 'step' + logger.warning('lr_decay_unit is not set in optimizer config, set to step by default!') if self.mode == 'train': - assert self.config.get("Optimizer", None) is not None, "Optimizer must be defined in config." 
- if self.config["Optimizer"].get('decay_unit', None) is not None: - self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] - else: - self.lr_decay_unit = 'step' - Warning('lr_decay_unit is not set in optimizer config, set to step by default') - - config_lr_scheduler = self.config["Optimizer"].get('LRScheduler', None) - self.lr_scheduler = None - if config_lr_scheduler is not None: - self.lr_scheduler = build_lr_scheduler(config_lr_scheduler, self.config["Global"]["epochs"], len(self.train_dataloader), self.lr_decay_unit) - - self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader), self.lr_scheduler) + config_lr_scheduler = self.config.get('LRScheduler', None) + self.lr_scheduler = None + if config_lr_scheduler is not None: + self.lr_decay_unit = config_lr_scheduler.get('decay_unit', + 'step') + self.lr_scheduler = build_lr_scheduler( + config_lr_scheduler, self.config["Global"]["epochs"], + len(self.train_dataloader)) + + self.optimizer = build_optimizer(self.config["Optimizer"], self.lr_scheduler, self.model, + self.config["Global"]["epochs"], len(self.train_dataloader), + self.lr_decay_unit) # load pretrained model if self.config["Global"]["pretrained_model"] is not None: @@ -368,4 +372,4 @@ def export(self): self.model.eval() path = os.path.join(self.output_dir, self.config["Model"]["name"]) - io.export(self.config["Export"], self.model, path) + io.export(self.config["Export"], self.model, path) \ No newline at end of file diff --git a/passl/models/swav.py b/passl/models/swav.py index 22905d9c..8d509560 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -105,68 +105,67 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): - """ - custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] - """ - - self.custom_cfg = config.pop('custom_cfg', None) - if self.custom_cfg is not None: - assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." + # def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): + # """ + # custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] + # """ + + # self.custom_cfg = config.pop('custom_cfg', None) + # if self.custom_cfg is not None: + # assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
- for item in self.custom_cfg: - assert isinstance( - item, dict), "The item of `custom_cfg` must be a dict" + # for item in self.custom_cfg: + # assert isinstance( + # item, dict), "The item of `custom_cfg` must be a dict" - param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) + # param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) - return param_group + # return param_group - def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): - # Collect different parameter groups - if self.custom_cfg is None or len(self.custom_cfg) == 0: - return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] - - # split params - self.weight_decay = config['weight_decay'] - params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault - params_dict['PasslDefault'] = [] - for name, param in model.named_parameters(): - if param.stop_gradient: - continue - for idx, item in enumerate(self.custom_cfg): - if item['name'] in name: - params_dict[item['name']].append(param) - break - else: - params_dict['PasslDefault'].append(param) - - res = [] - for item in self.custom_cfg: - weight_decay_mult = item.get("weight_decay_mult", None) - if item.get("LRScheduler", None) is not None: - lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) - else: - Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) - # todo: initialize LRCallable here. - param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} - - if self.weight_decay is not None and weight_decay_mult is not None: - param_dict['weight_decay'] = self.weight_decay * weight_decay_mult - param_dict['tensor_fusion'] = tensor_fusion - res.append(param_dict) - else: - res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) - - msg = 'Parameter groups for optimizer: \n' - for idx, item in enumerate(self.custom_cfg): - params_name = [p.name for p in params_dict[item['name']]] - item = item.copy() - item['params_name'] = params_name - msg += 'Group {}: \n{} \n'.format(idx, item) - logger.info(msg) - - return res + # def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): + # # Collect different parameter groups + # if self.custom_cfg is None or len(self.custom_cfg) == 0: + # return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] + + # # split params + # self.weight_decay = config['weight_decay'] + # params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault + # params_dict['PasslDefault'] = [] + # for name, param in model.named_parameters(): + # if param.stop_gradient: + # continue + # for idx, item in enumerate(self.custom_cfg): + # if item['name'] in name: + # params_dict[item['name']].append(param) + # break + # else: + # params_dict['PasslDefault'].append(param) + + # res = [] + # for item in self.custom_cfg: + # weight_decay_mult = item.get("weight_decay_mult", None) + # if item.get("LRScheduler", None) is not None: + # lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) + # else: + # Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) + # param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} + + # if self.weight_decay is not None and weight_decay_mult is not None: + # 
param_dict['weight_decay'] = self.weight_decay * weight_decay_mult + # param_dict['tensor_fusion'] = tensor_fusion + # res.append(param_dict) + # else: + # res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) + + # msg = 'Parameter groups for optimizer: \n' + # for idx, item in enumerate(self.custom_cfg): + # params_name = [p.name for p in params_dict[item['name']]] + # item = item.copy() + # item['params_name'] = params_name + # msg += 'Group {}: \n{} \n'.format(idx, item) + # logger.info(msg) + + # return res def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 43216690..9f2170ae 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -18,12 +18,12 @@ from collections import defaultdict import copy +import re import paddle from passl.core.grad_clip import ClipGradByGlobalNorm from passl.core.param_fuse import get_fused_params -from passl.scheduler import LRCallable - +from passl.scheduler import build_lr_scheduler, LRCallable from passl.utils import logger from .optimizer import Optimizer @@ -32,76 +32,183 @@ from .momentum import Momentum from .momentum_lars import MomentumLARS from .momentum_larc import MomentumLARC - - -def build_optimizer(optim_config, model, config, trainset_length, lr_scheduler): - optim_config = copy.deepcopy(optim_config) - optim_name = optim_config.pop('name') - +from .utils.group_params import ( + param_group_layer_decay, + param_group_weight_decay, + group_params_by_state) + + +def build_group_lr_scheduler(param_groups_cfg, epochs, step_each_epoch, lr_decay_unit): + ''' + Build lr scheduler in each param_group. + Args: + param_groups_cfg: Dict, param_groups config + epochs: Int, epochs + step_each_epoch: Int, step for each epoch + + Returns: + param_groups_cfg: Dict of param_groups config in which lr has beed build + ''' + for idx, item in enumerate(param_groups_cfg): + lr_cfg = item.get('lr', None) + if isinstance(lr_cfg, dict): + if 'decay_unit' in lr_cfg: + logger.warning('decay_unit is no need to set, for it will be reset by lr_decay_unit.') + lr_cfg['decay_unit'] = lr_decay_unit + lr_scheduler = build_lr_scheduler(lr_cfg, epochs, step_each_epoch) + if isinstance(lr_scheduler, LRCallable): + item['lr_func'] = lr_scheduler + else: + item['lr'] = lr_scheduler + elif isinstance(lr_cfg, float): + item['lr'] = lr_cfg + logger.info('build lr scheduler in param_groups succeed.') + return param_groups_cfg + + +def group_params(model, param_groups_cfg=None): + ''' + Group params by config or by stop_gradient by default. + Args: + model: paddle.nn.Layer + param_groups_cfg: Dict, param_groups config + Returns: + Dict, f.g. 
{'group_name': {'params': [(name, param), ...],}} + ''' + + if param_groups_cfg and len(param_groups_cfg) > 0: + params_dict = {} + # init params_dict by config + for group in param_groups_cfg: + params_dict[group['name']] = {} + params_dict[group['name']]['params'] = [] + for k, v in group.items(): + params_dict[group['name']][k] = v + # add params + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + flag = 0 + for g_name in params_dict: + if 'regular_exp' in params_dict[g_name]: + regular_exp = params_dict[g_name]['regular_exp'] + group_matcher = re.compile(regular_exp) + else: + group_matcher = re.compile(g_name) + if group_matcher.match(name): + params_dict[g_name]["params"].append((name, param)) + flag = 1 + break + if flag == 0: + if 'default' not in params_dict: + params_dict['default'] = {'params': []} + params_dict['default']["params"].append((name, param)) + + logger.info(f'Model parameters has been split into {len(params_dict)} groups by config.') + for key in params_dict: + logger.info(f"{key}-params length: {len(params_dict[key]['params'])}") + + return params_dict + + # default group method + param_groups = [] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + param_groups.append((name, param)) + logger.info(f'Model parameters has been split into 1 groups by default.') + return {'default': {"params": param_groups}} + + +def build_optimizer(config, lr_scheduler, model, epochs, step_each_epoch, lr_decay_unit): + config = copy.deepcopy(config) + + optim_name = config.pop('name') + layer_decay = config.pop('layer_decay', None) grad_clip = None - grad_clip_config = optim_config.pop('grad_clip', None) + grad_clip_config = config.pop('grad_clip', None) if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval(grad_clip_name)(**grad_clip_config) - no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) - tensor_fusion = optim_config.pop('tensor_fusion', True) + weight_decay = config.get('weight_decay', None) + no_weight_decay_name = config.pop('no_weight_decay_name', []) + + tensor_fusion = config.pop('tensor_fusion', True) if 'LAR' in optim_name: tensor_fusion = False - logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') - - if hasattr(model, 'param_groups'): - # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim - param_group = model.param_groups(optim_config, tensor_fusion, config["Global"]["epochs"], trainset_length) - for group in param_group: - if 'tensor_fusion' in group and group['tensor_fusion']: - group['params'] = get_fused_params(group['params']) - optim_config.pop('custom_cfg', None) - + logger.info('LARS or LARC Optimizer can not use tensor fusion technology. 
' + 'It automatically fall back to `tensor_fusion = False`.') + + # param_groups is a dict like {'group_name': {'params': [(name, param), ...]}} + if hasattr(model, 'param_group_fn'): + # param groups are defined by model + model_group_cfg = config.pop('param_group_fn', {}) + param_group_map = model.param_group_fn(no_weight_decay_name=no_weight_decay_name, weight_decay=weight_decay, + layer_decay=layer_decay, **model_group_cfg) else: - param_group_map = defaultdict(list) - for n, p in model.named_parameters(): - state = copy.deepcopy(p.__dict__) - state['stop_gradient'] = p.stop_gradient - if any(nd in n for nd in no_weight_decay_name): - state['no_weight_decay'] = True - param_group_map[str(state)].append(p) - - if tensor_fusion: - # fuse params - for key in param_group_map: - if 'gpu' not in paddle.get_device(): - continue - if "'is_distributed': True" in key: - continue - if "'has_sparse_grad': True" in key: - continue - param_group_map[key] = get_fused_params(param_group_map[key]) - - # bulid optimizer params - param_group = [] + param_groups_cfg = config.get('param_groups', None) + if param_groups_cfg and len(param_groups_cfg) > 0: + param_groups_cfg = build_group_lr_scheduler(param_groups_cfg, epochs, step_each_epoch, lr_decay_unit) + param_group_map = group_params(model, param_groups_cfg) + if isinstance(layer_decay, float): + param_group_map = param_group_layer_decay(model, + layer_decay, + weight_decay=weight_decay, + param_groups_map=param_group_map, + no_weight_decay_list=no_weight_decay_name, + ) + elif len(no_weight_decay_name) > 0: + param_group_map = param_group_weight_decay(model, + weight_decay=weight_decay, + param_groups_map=param_group_map, + no_weight_decay_list=no_weight_decay_name, + ) + + for key in param_group_map: + param_group_map[key]['params'] = [p for (n, p) in param_group_map[key]['params']] + + if tensor_fusion: + param_group_map = group_params_by_state(param_group_map) + # fuse params for key in param_group_map: - group = {'params': param_group_map[key]} - + if 'gpu' not in paddle.get_device(): + continue if "'is_distributed': True" in key: - group['is_distributed'] = True - - if 'no_weight_decay' in key: - group['weight_decay'] = 0.0 - - param_group.append(group) - - lr = lr_scheduler - lr_func = None - if isinstance(lr_scheduler, LRCallable): - lr = lr_scheduler.lr + continue + if "'has_sparse_grad': True" in key: + continue + param_group_map[key]["params"] = get_fused_params(param_group_map[key]["params"]) + + param_group = [] + for key in param_group_map: + group = param_group_map[key] + if "'is_distributed': True" in key: + group['is_distributed'] = True + if 'no_weight_decay' in key: + group['weight_decay'] = 0.0 + param_group.append(group) + + # build default lr scheduler + lr = lr_scheduler + lr_func = None + lr_cfg = config.pop('lr', None) + if isinstance(lr_cfg, float): + lr = lr_cfg + elif isinstance(lr_cfg, dict): + if 'decay_unit' in lr_cfg: + logger.warning('decay_unit is no need to set, for it will be reset by lr_decay_unit.') + lr_cfg['decay_unit'] = lr_decay_unit + lr_scheduler = build_lr_scheduler(lr_cfg, epochs, step_each_epoch) + lr = lr_scheduler + if isinstance(lr_scheduler, LRCallable): + lr = lr_scheduler.lr lr_func = lr_scheduler - + assert lr is not None, 'lr should not be None.' 
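+    # Illustrative `lr` settings for the branches above (example values; the TimmCosine
+    # numbers mirror the SwAV pre-training YAML):
+    #     lr: 0.1    -> kept as a constant float learning rate
+    #     lr: {name: TimmCosine, learning_rate: 0.6, eta_min: 0.0006}
+    #                -> turned into an LRScheduler by build_lr_scheduler
+    # If the built scheduler is an LRCallable, its base lr is used here and the callable is
+    # handed to the optimizer as lr_func, which lr_step() then applies per parameter group.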
optim = eval(optim_name)(param_group, - lr=lr, + lr=lr, lr_func=lr_func, grad_clip=grad_clip, - **optim_config) - + **config) logger.debug("build optimizer ({}) success..".format(optim)) - return optim + return optim \ No newline at end of file diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 179839fc..55402fd4 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -26,6 +26,8 @@ class Momentum(Optimizer): def __init__(self, params, + lr=0.001, + lr_func=None, momentum=0.9, weight_decay=0.0, use_master_param=True, @@ -33,6 +35,8 @@ def __init__(self, **args): defaults = dict( + lr=lr, + lr_func=lr_func, momentum=momentum, weight_decay=weight_decay, use_master_param=use_master_param, @@ -68,7 +72,7 @@ def step(self): grad = p.grad if grad is None: continue - # print('###########',p.name) + if grad.is_selected_rows(): raise RuntimeError( 'Momentum does not support sparse gradients.') diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 234af8b9..19cd3428 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -83,7 +83,6 @@ def add_param_group(self, param_group): param_group.setdefault(name, deepcopy(default)) else: param_group.setdefault(name, default) - params = param_group['params'] if len(params) != len(set(params)): warnings.warn( @@ -114,6 +113,15 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + @staticmethod + def _get_lr(param_group): + lr_t = param_group["lr"] + if isinstance(lr_t, paddle.optimizer.lr.LRScheduler): + lr_t = lr_t.get_lr() + if 'lr_scale' in param_group: + lr_t *= param_group['lr_scale'] + return lr_t + def state_dict(self): def pack_group(group): packed = {k: v for k, v in group.items() if k != 'params'} @@ -206,16 +214,13 @@ def clear_grad(self, set_to_zero=True): @paddle.no_grad() def lr_step(self, step=None): - for i, group in enumerate(self.param_groups): + for group in self.param_groups: lr = group['lr'] - - if isinstance(lr, paddle.optimizer.lr.LRScheduler): # group defined lr scheduler + if isinstance(lr, paddle.optimizer.lr.LRScheduler): lr.step(step) elif 'lr_func' in group and callable(group['lr_func']): group['lr_func'](group, step) - # print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) - @paddle.no_grad() def get_lr(self, group_id=0): lr = self.param_groups[group_id]['lr'] @@ -225,4 +230,4 @@ def get_lr(self, group_id=0): @paddle.no_grad() def step(self): - raise NotImplementedError + raise NotImplementedError \ No newline at end of file diff --git a/passl/optimizer/utils/__init__.py b/passl/optimizer/utils/__init__.py new file mode 100644 index 00000000..9d9f7a4a --- /dev/null +++ b/passl/optimizer/utils/__init__.py @@ -0,0 +1 @@ +from .group_params import * \ No newline at end of file diff --git a/passl/optimizer/utils/group_params.py b/passl/optimizer/utils/group_params.py new file mode 100644 index 00000000..9108222c --- /dev/null +++ b/passl/optimizer/utils/group_params.py @@ -0,0 +1,194 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import re +from collections import defaultdict +from passl.utils import logger + + +def group_with_matcher(model, group_matcher): + """ + + Args: + named_params: List like [(name, param),] + group_matcher: Dict like {group_name: regular_expression1} + Returns: + param_groups: Dict like {group_name: [param_name1, param_name2, ...]} + + """ + matcher_list = [] + for group_name, re_exps in group_matcher.items(): + assert re_exps is not None, "re_exps should not be None." + if isinstance(re_exps, (tuple, list)): + for re_str in re_exps: + matcher_list.append((group_name, re.compile(re_str))) + else: + matcher_list.append((group_name, re.compile(re_exps))) + param_groups = defaultdict(list) + default_group = [] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + flag = 0 + for group_name, matcher in matcher_list: + res = matcher.match(name) + if res: + param_groups[group_name].append((name, param)) + flag = 1 + if flag == 0: + default_group.append((name, param)) + if len(default_group) > 0: + param_groups['default'] = default_group + param_groups = {k: {"params": v} for k, v in param_groups.items()} + return param_groups + + +def group_params_by_state(param_groups_map): + ''' + group parameters by state for tensor fusion + Args: + param_groups_map: Dict like {'group_name': {'params': [param1, param2, ...]}} + + Returns: + new_param_groups: Dict like {'group_name': {'params': [param1, param2, ...]}} + ''' + new_param_groups = {} + for g_name in param_groups_map: + for param in param_groups_map[g_name]['params']: + if param.stop_gradient: + continue + state = copy.deepcopy(param.__dict__) + new_group_name = g_name+'_'+str(state) + if new_group_name not in new_param_groups: + new_param_groups[new_group_name] = { + "params": [], + "group_name": new_group_name, + } + for key in param_groups_map[g_name]: + if key not in ["params", "group_name"]: + new_param_groups[new_group_name][key] = param_groups_map[g_name][key] + + new_param_groups[new_group_name]["params"].append(param) + logger.info(f"The original param_groups which has {len(param_groups_map)} " + f"groups has been split to {len(new_param_groups)} groups by state.") + return new_param_groups + + +def param_group_layer_decay( + model, + layer_decay, + weight_decay=None, + group_matcher=None, + no_weight_decay_list=(), + param_groups_map=None, + ): + ''' + group parameters by layer_decay and weight_decay setting + Args: + model: instance of paddle.nn.Layer + layer_decay: float or None + weight_decay: float or None by default, which can also assigned in the optimizer args, + but it has the highest priority if given here. + group_matcher: Dict like {group_name: regular_expression1} + no_weight_decay_list: list of string(layer name keyword) + param_groups_map: Dict like {group_name: {'params': [(name, group), ...]}} + + Returns: + param_groups: Dict like {group_name: {'params': [(name, group), ...]}} + ''' + assert (not group_matcher) or (not param_groups_map), \ + "group_matcher and param_names_group should not be given in the same time." 
+ if group_matcher: + param_groups_map = group_with_matcher(model, group_matcher) + num_layers = len(param_groups_map) + layer_scales = {z[0]: layer_decay ** (num_layers - i) for i, (k, v) in enumerate(param_groups_map.items()) for z in v} + param_groups = {} + for g_name in param_groups_map: + for name, param in param_groups_map[g_name]['params']: + if param.stop_gradient: + continue + lr_scale = layer_scales[name] if name in layer_scales else 1. + if any(nd in name for nd in no_weight_decay_list): + this_decay = 0. + g_decay = "no_weight_decay" + else: + this_decay = weight_decay + g_decay = "weight_decay" + new_group_name = g_name + '_' + g_decay + if new_group_name not in param_groups: + param_groups[new_group_name] = { + "lr_scale": lr_scale, + "params": [], + "group_name": new_group_name, + } + for key in param_groups_map[g_name]: + if key not in param_groups[new_group_name]: + param_groups[new_group_name][key] = param_groups_map[g_name][key] + if this_decay is not None: + param_groups[new_group_name]["weight_decay"] = this_decay + param_groups[new_group_name]["params"].append((name, param)) + return param_groups + + +def param_group_weight_decay( + model, + group_matcher=None, + weight_decay=None, + no_weight_decay_list=(), + param_groups_map=None, + ): + ''' + group parameters by weight_decay setting + Args: + model: instance of paddle.nn.Layer + group_matcher: Dict like {group_name: regular_expression1} + weight_decay: float or None by default, which can also assigned in the optimizer args, + but it has the highest priority if given here. + no_weight_decay_list: list of string(layer name keyword) + param_groups_map: Dict like {group_name: {'params': [(name, group), ...]}} + + Returns: + param_groups: Dict like {group_name: {'params': [(name, group), ...]}} + ''' + # weight_decay value can be None and assigned in the optimizer config, + # but it has the highest priority if given here. + assert (not group_matcher) or (not param_groups_map), \ + "group_matcher and param_names_group should not be given in the same time." + param_groups = {} + if group_matcher is not None: + param_groups_map = group_with_matcher(model, group_matcher) + for g_name in param_groups_map: + for name, param in param_groups_map[g_name]['params']: + if param.stop_gradient: + continue + if any(nd in name for nd in no_weight_decay_list): + g_decay = "no_weight_decay" + this_decay = 0. 
+ else: + g_decay = "weight_decay" + this_decay = weight_decay + new_group_name = g_name + "_" + g_decay + if new_group_name not in param_groups: + param_groups[new_group_name] = { + "params": [], + "group_name": new_group_name, + } + for key in param_groups_map[g_name]: + if key not in param_groups[new_group_name]: + param_groups[new_group_name][key] = param_groups_map[g_name][key] + if this_decay is not None: + param_groups[new_group_name]["weight_decay"] = this_decay + param_groups[new_group_name]["params"].append((name, param)) + + return param_groups \ No newline at end of file diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index 4f31e170..bb0522e3 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -17,11 +17,11 @@ from passl.utils import logger from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly -from .lr_callable import LRCallable, CosineWithFixLR +from .lr_callable import LRCallable -def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): - lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch, 'decay_unit': decay_unit }) +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) if 'name' in lr_config: lr_name = lr_config.pop('name') if "MultiStepDecay" in lr_name: @@ -39,4 +39,4 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): else: lr = lr_config['learning_rate'] logger.debug("build lr ({}) success..".format(lr)) - return lr + return lr \ No newline at end of file diff --git a/passl/scheduler/lr_callable.py b/passl/scheduler/lr_callable.py index b4722733..3bd049e9 100644 --- a/passl/scheduler/lr_callable.py +++ b/passl/scheduler/lr_callable.py @@ -1,17 +1,3 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- import math class LRCallable(object): @@ -35,4 +21,4 @@ def __call__(self, group, epoch): if 'fix_lr' in group and group['fix_lr']: group['lr'] = self.lr else: - group['lr'] = cur_lr + group['lr'] = cur_lr \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 8f641d0f..6d9687ab 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -41,16 +41,16 @@ Optimizer: momentum: 0.9 weight_decay: 0.0 tensor_fusion: False - decay_unit: epoch - LRScheduler: + lr_decay_unit: epoch + lr: name: MultiStepDecay learning_rate: 0.02 milestones: [12, 16] gamma: 0.2 last_epoch: -1 - custom_cfg: - - name: head - LRScheduler: + param_groups: + - name: res_model.projection_head + lr: name: MultiStepDecay learning_rate: 5 milestones: [12, 16] @@ -74,7 +74,7 @@ DataLoader: samples_tag: semi_10 sampler: name: DistributedBatchSampler - batch_size: 64 # accum_steps: 1, total batchsize: 256 + batch_size: 128 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -96,7 +96,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 64 + batch_size: 128 drop_last: False shuffle: False loader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 5a4e9a83..bf7bdf5b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -42,8 +42,8 @@ Optimizer: momentum: 0.9 weight_decay: 1e-6 tensor_fusion: True - decay_unit: epoch - LRScheduler: + lr_decay_unit: epoch + lr: name: TimmCosine learning_rate: 0.3 eta_min: 0.0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 9eaf4fa1..a6d0c2e5 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -46,8 +46,8 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False - decay_unit: step - LRScheduler: + lr_decay_unit: step + lr: name: TimmCosine learning_rate: 0.6 eta_min: 0.0006 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 5ccbc6ad..4cf7398e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -49,8 +49,8 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False - decay_unit: step - LRScheduler: + lr_decay_unit: step + lr: name: TimmCosine learning_rate: 4.8 eta_min: 0.0048 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 9844d806..e52a9d0f 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -17,7 +17,7 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=4,5,6,7 +export CUDA_VISIBLE_DEVICES=0,1 #,2,5 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ From 77060bf5d78282ec33746eeb41581674ea554be5 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 
17:17:52 +0800 Subject: [PATCH 20/46] format --- passl/core/param_fuse.py | 2 +- passl/data/dataset/multicrop_dataset.py | 4 +- passl/engine/engine.py | 2 +- passl/engine/loops/classification_loop.py | 8 +- .../engine/loops/contrastive_learning_loop.py | 113 +------------ passl/models/swav.py | 154 ++++-------------- passl/models/swav_resnet.py | 29 +++- passl/optimizer/__init__.py | 4 +- passl/optimizer/optimizer.py | 2 +- passl/optimizer/utils/__init__.py | 16 +- passl/optimizer/utils/group_params.py | 2 +- passl/scheduler/__init__.py | 2 +- passl/scheduler/lr_callable.py | 2 +- passl/utils/io.py | 4 +- tasks/ssl/swav/README.md | 2 +- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 +- ...swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 2 +- tasks/ssl/swav/finetune.sh | 4 +- tasks/ssl/swav/pretrain.sh | 2 +- .../swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh | 2 +- .../swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh | 2 +- ...wav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh | 2 +- 22 files changed, 105 insertions(+), 259 deletions(-) diff --git a/passl/core/param_fuse.py b/passl/core/param_fuse.py index 87fc5cb3..f3ff5e46 100644 --- a/passl/core/param_fuse.py +++ b/passl/core/param_fuse.py @@ -504,4 +504,4 @@ def get_fused_params(params): for group_idx, parameters in var_groups.items(): fused_param = flatten_dense_tensors(parameters) fused_params.append(fused_param) - return fused_params \ No newline at end of file + return fused_params diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index 42b800f7..a4488e7b 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -72,7 +72,7 @@ def __getitem__(self, index): path, target = self.imgs[index] sample = self.loader(path) sample = list(map(lambda trans: trans(sample), self.trans)) - + return sample, target @@ -89,4 +89,4 @@ def get_color_distortion(s=1.0): rnd_color_jitter = RandomApply([color_jitter], p=0.8) rnd_gray = RandomGrayscale(p=0.2) color_distort = Compose([rnd_color_jitter, rnd_gray]) - return color_distort \ No newline at end of file + return color_distort diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 7cacb83f..c3277561 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -372,4 +372,4 @@ def export(self): self.model.eval() path = os.path.join(self.output_dir, self.config["Model"]["name"]) - io.export(self.config["Export"], self.model, path) \ No newline at end of file + io.export(self.config["Export"], self.model, path) diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 08bdc1d0..659bcc19 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -51,7 +51,7 @@ def forward_backward(self, batch): for idx in range(self.trainer.accum_steps): data = batch[0][idx * step_size:(idx + 1) * step_size] label = batch[1][idx * step_size:(idx + 1) * step_size] - + # do cast if using fp16 otherwise do nothing with paddle.amp.auto_cast( enable=self.trainer.fp16, @@ -61,7 +61,7 @@ def forward_backward(self, batch): out = self.trainer.model(data) final_out.append(out) - + loss_dict = self.trainer.train_loss_func(out, label) for key in loss_dict: @@ -92,7 +92,7 @@ def train_one_step(self, batch, total_iterations=None): self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - + if self.trainer.lr_decay_unit == 'step': self.trainer.optimizer.lr_step(self.global_step) @@ -175,7 +175,7 @@ def eval_one_dataset(self, 
eval_dataloader): custom_white_list=self.trainer.fp16_custom_white_list, custom_black_list=self.trainer.fp16_custom_black_list, level=self.trainer.fp16_level): - + out = self.trainer.model(batch[0]) # calc loss if self.trainer.eval_loss_func is not None: diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index d943d4cc..5bdefea6 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -16,97 +16,13 @@ from __future__ import division from __future__ import print_function -import os -import sys -import logging -from datetime import timedelta - -import time import collections -import platform import paddle -from passl.core import grad_sync, param_sync -from passl.utils import io +from passl.core import grad_sync -from passl.utils import profiler from passl.utils import logger from .loop import TrainingEpochLoop - -class LogFormatter: - def __init__(self): - self.start_time = time.time() - - def format(self, record): - elapsed_seconds = round(record.created - self.start_time) - - prefix = "%s - %s - %s" % ( - record.levelname, - time.strftime("%x %X"), - timedelta(seconds=elapsed_seconds), - ) - message = record.getMessage() - message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) - return "%s - %s" % (prefix, message) if message else "" - - -def create_logger(filepath, rank): - """ - Create a logger. - Use a different log file for each process. - """ - # create log formatter - log_formatter = LogFormatter() - - # create file handler and set level to debug - if filepath is not None: - if rank > 0: - filepath = "%s-%i" % (filepath, rank) - file_handler = logging.FileHandler(filepath, "a") - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(log_formatter) - - # create console handler and set level to info - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(log_formatter) - - # create logger and set level to debug - logger = logging.getLogger() - logger.handlers = [] - logger.setLevel(logging.DEBUG) - logger.propagate = False - if filepath is not None: - logger.addHandler(file_handler) - logger.addHandler(console_handler) - - # reset logger elapsed time - def reset_time(): - log_formatter.start_time = time.time() - - logger.reset_time = reset_time - - return logger - - -def init_logger(name): - logger = create_logger( - os.path.join("{}.log".format(name)), rank=0 - ) - logger.info("============ Initialized logger ============") - logger.info("") - return logger - - -def log_model(model, logger): - model1 = model.res_model - for name, param in model1.named_parameters(): - logger.info(name) - logger.info(param.abs().sum()) - if param.grad is not None: - logger.info(name+'grad') - logger.info(param.grad.abs().sum()) - class ContrastiveLearningTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): @@ -136,9 +52,7 @@ def forward_backward(self, batch, total_iterations): if isinstance(loss_dict, paddle.Tensor): loss_dict = {'loss': loss_dict} - ####### test ####### - # logger1 = init_logger('before_pretrain') - # log_model(self.trainer.model, logger1) + for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -148,32 +62,19 @@ def forward_backward(self, batch, total_iterations): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - - try: + + try: 
self.trainer.model.after_loss_backward(total_iterations) except AttributeError: logger.warning("Model has no after_loss_backward method, ignored this process") - - ####### test ####### -# grad_sync(self.trainer.optimizer.param_groups) - -# # do unscale and step if using fp16 and not found nan/inf -# # otherwise do nothing -# self.trainer.scaler.step(self.trainer.optimizer) -# # do update loss scaling if using fp16 -# # otherwise do nothing -# self.trainer.scaler.update() - -# logger2 = init_logger('after_pretrain') - # log_model(self.trainer.model, logger2) - # print('final_loss_dict', final_loss_dict) + return final_loss_dict def train_one_step(self, batch, total_iterations): # remove label batch = batch[0] - + # do forward and backward loss_dict = self.forward_backward(batch, total_iterations) @@ -185,7 +86,7 @@ def train_one_step(self, batch, total_iterations): # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() - + # clear gradients self.trainer.optimizer.clear_grad() if self.trainer.lr_decay_unit == 'step': diff --git a/passl/models/swav.py b/passl/models/swav.py index 8d509560..50795189 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
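A brief note on the loop code being reformatted above: `forward_backward` divides each loss by `accum_steps` and only the outer `train_one_step` calls the optimizer, so gradients accumulate over several micro-batches before a single update. A minimal sketch of that pattern in plain Paddle (no AMP scaler, names are illustrative):

```python
import paddle

def train_one_step(model, loss_fn, optimizer, micro_batches):
    """Accumulate gradients over len(micro_batches) forward/backward passes,
    then apply one optimizer update for the whole effective batch."""
    accum_steps = len(micro_batches)
    for data, label in micro_batches:
        loss = loss_fn(model(data), label) / accum_steps  # keep the loss scale comparable
        loss.backward()                                    # gradients add up across micro-batches
    optimizer.step()
    optimizer.clear_grad()
```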
+ import os import numpy as np from sys import flags @@ -23,17 +37,16 @@ 'SwAVPretrain', ] -# def model and class SwAV(Model): def __init__(self, **kwargs): super().__init__() self.res_model = swavresnet50(**kwargs) - + def _load_model(self, path, model, tag): path = path + ".pdparams" if os.path.isfile(path): para_state_dict = paddle.load(path) - + # resnet model_state_dict = model.state_dict() keys = model_state_dict.keys() @@ -68,13 +81,13 @@ def save(self, path, local_rank=0, rank=0): def _freeze_norm(self, layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True - + class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) self.linear = RegLog(class_num) self.res_model.eval() - + # freeze all layers but the last fc for name, param in self.named_parameters(): if name not in ['linear.linear.weight', 'linear.linear.bias']: @@ -84,9 +97,9 @@ def __init__(self, class_num=1000, **kwargs): parameters = list( filter(lambda p: not p.stop_gradient, self.parameters())) assert len(parameters) == 2 # weight, bias - + self.apply(self._freeze_norm) - + def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') @@ -94,79 +107,17 @@ def forward(self, inp): with paddle.no_grad(): output = self.res_model(inp) output = self.linear(output) - + return output class SwAVFinetune(SwAV): def __init__(self, **kwargs): super().__init__(**kwargs) self.apply(self._freeze_norm) - + def load_pretrained(self, path, rank=0, finetune=False): - self._load_model(path, self.res_model, 'backbone') - - # def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): - # """ - # custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] - # """ - - # self.custom_cfg = config.pop('custom_cfg', None) - # if self.custom_cfg is not None: - # assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
- - # for item in self.custom_cfg: - # assert isinstance( - # item, dict), "The item of `custom_cfg` must be a dict" - - # param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) - - # return param_group - - # def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): - # # Collect different parameter groups - # if self.custom_cfg is None or len(self.custom_cfg) == 0: - # return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] - - # # split params - # self.weight_decay = config['weight_decay'] - # params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault - # params_dict['PasslDefault'] = [] - # for name, param in model.named_parameters(): - # if param.stop_gradient: - # continue - # for idx, item in enumerate(self.custom_cfg): - # if item['name'] in name: - # params_dict[item['name']].append(param) - # break - # else: - # params_dict['PasslDefault'].append(param) - - # res = [] - # for item in self.custom_cfg: - # weight_decay_mult = item.get("weight_decay_mult", None) - # if item.get("LRScheduler", None) is not None: - # lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) - # else: - # Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) - # param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} - - # if self.weight_decay is not None and weight_decay_mult is not None: - # param_dict['weight_decay'] = self.weight_decay * weight_decay_mult - # param_dict['tensor_fusion'] = tensor_fusion - # res.append(param_dict) - # else: - # res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) - - # msg = 'Parameter groups for optimizer: \n' - # for idx, item in enumerate(self.custom_cfg): - # params_name = [p.name for p in params_dict[item['name']]] - # item = item.copy() - # item['params_name'] = params_name - # msg += 'Group {}: \n{} \n'.format(idx, item) - # logger.info(msg) - - # return res - + self._load_model(path, self.res_model, 'backbone') + def forward(self, inp): return self.res_model(inp) @@ -179,23 +130,11 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep self.epsilon = epsilon self.freeze_prototypes_niters = freeze_prototypes_niters - # initialize queue - self.queue = None - # queue_path = os.path.join('.', "queue" + str(0) + ".pth") - # if os.path.isfile(queue_path): - # self.queue = paddle.load(queue_path)["queue"] - # # the queue needs to be divisible by the batch size - # queue_length = queue_length - # queue_length -= queue_length % (256) - # if queue_length > 0 and epoch >= 15 and self.queue is None: - # self.queue = paddle.zeros([len(crops_for_assign), - # queue_length // 4, kwargs['output_dim']]) - # self.load_pretrained('swav_800ep_pretrain.pdparams') self.apply(self._freeze_norm) - + def load_pretrained(self, path, rank=0, finetune=False): - self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') - + self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') + @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): Q = paddle.exp(x=out / self.epsilon).t() @@ -215,11 +154,6 @@ def distributed_sinkhorn(self, out, sinkhorn_iterations=3): return Q.t() def forward(self, inp): - # ####### test ####### - # import numpy as np - # np.random.seed(42) - # a = np.random.rand(32, 3, 224, 224) - # inp = paddle.to_tensor(a).astype('float32') 
bs = inp[0].shape[0] # normalize the prototypes @@ -228,8 +162,6 @@ def forward(self, inp): w = paddle.nn.functional.normalize(x=w, axis=0, p=2) # 1 paddle.assign(w, self.res_model.prototypes.weight) embedding, output = self.res_model(inp) - # print('output, embedding', embedding.mean(), output.mean(), inp.mean()) - # import pdb; pdb.set_trace() embedding = embedding.detach() # compute loss @@ -237,39 +169,25 @@ def forward(self, inp): for i, crop_id in enumerate(self.crops_for_assign): with paddle.no_grad(): out = output[bs * crop_id:bs * (crop_id + 1)].detach() - # print('bs, crop_id', bs, crop_id, self.nmb_crops) - if self.queue is not None: - if use_the_queue or not paddle.all(x=self.queue[(i), (-1), :] == 0): - use_the_queue = True - out = paddle.concat(x=(paddle.mm(input=self.queue[i], - mat2=self.res_model.prototypes.weight.t()), out)) - self.queue[(i), bs:] = self.queue[(i), :-bs].clone() - self.queue[(i), :bs] = embedding[crop_id * bs:(crop_id + 1) * bs] - q = self.distributed_sinkhorn(out)[-bs:] - # print('out.mean(), q.mean()', out.mean(), q.mean()) - + subloss = 0 - # print(output.shape) for v in np.delete(np.arange(np.sum(self.nmb_crops)), crop_id): x = output[bs * v:bs * (v + 1)] / self.temperature subloss -= paddle.mean(x=paddle.sum(x=q * paddle.nn. functional.log_softmax(x=x, axis=1), axis=1)) - # print('v, subloss', v, subloss) - + loss += subloss / (np.sum(self.nmb_crops) - 1) - # print('i, loss', i, loss) - # import pdb; pdb.set_trace() loss /= len(self.crops_for_assign) return loss - + def after_loss_backward(self, iteration): if iteration < self.freeze_prototypes_niters: for name, p in self.res_model.named_parameters(): if 'prototypes' in name and p.grad is not None: p.clear_grad() - + def swav_resnet50_linearprobe(**kwargs): model = SwAVLinearProbe(**kwargs) return model @@ -295,9 +213,9 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo # with apex syncbn speeds up computation than global syncbn process_group = apex.parallel.create_syncbn_process_group(8) model = apex.parallel.convert_syncbn_model(model, process_group=process_group) - - return model - + + return model + class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" @@ -306,11 +224,11 @@ def __init__(self, num_labels): s = 2048 self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) - + init.normal_(self.linear.weight, mean=0.0, std=0.01) init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) x = x.reshape((x.shape[0], -1)) - return self.linear(x) \ No newline at end of file + return self.linear(x) diff --git a/passl/models/swav_resnet.py b/passl/models/swav_resnet.py index 2869eedc..03e5dffb 100644 --- a/passl/models/swav_resnet.py +++ b/passl/models/swav_resnet.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
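To make the swapped-assignment loss above easier to follow, here is a single-process sketch of the Sinkhorn-Knopp normalization that `distributed_sinkhorn` performs, written in NumPy and without the distributed all-reduce; the function and variable names are illustrative:

```python
import numpy as np

def sinkhorn(scores, epsilon=0.05, n_iters=3):
    """scores: [batch, num_prototypes] prototype similarities for one crop.

    Returns soft assignments of the same shape whose prototype marginals are
    roughly uniform, so every prototype receives a similar share of the batch.
    """
    Q = np.exp(scores / epsilon).T         # [num_prototypes, batch]
    Q /= Q.sum()
    K, B = Q.shape
    for _ in range(n_iters):
        Q /= Q.sum(axis=1, keepdims=True)  # balance prototype (row) marginals
        Q /= K
        Q /= Q.sum(axis=0, keepdims=True)  # renormalize each sample (column)
        Q /= B
    return (Q * B).T                       # each returned row sums to 1
```

The codes `q` obtained this way for one crop are then predicted from the temperature-scaled softmax of every other crop, which is the cross-entropy term accumulated into `subloss` in the forward pass above.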
+ import paddle import functools import paddle.nn as nn @@ -11,20 +25,20 @@ def kaiming_normal_init(param, **kwargs): def constant_init(param, **kwargs): initializer = nn.initializer.Constant(**kwargs) initializer(param, param.block) - - + + class SwAVResNet(paddle.nn.Layer): def __init__(self, block, depth, normalize=False, output_dim=0, hidden_mlp=0, nmb_prototypes=0, eval_mode=False): - + super(SwAVResNet, self).__init__() self.l2norm = normalize self.eval_mode = eval_mode num_out_filters = 512 - + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - + if output_dim == 0: self.projection_head = None elif hidden_mlp == 0: @@ -59,7 +73,7 @@ def forward_backbone(self, x): if self.eval_mode: return x - + x = self.avgpool(x) x = paddle.flatten(x=x, start_axis=1) return x @@ -76,7 +90,7 @@ def forward_head(self, x): def forward(self, inputs): if not isinstance(inputs, list): inputs = [inputs] - + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. to_tensor(data=[inp.shape[-1] for inp in inputs]), return_counts=True)[1], axis=0) # padiff @@ -108,4 +122,3 @@ def forward(self, x): def swavresnet50(**kwargs): return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) - diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 9f2170ae..5f7d3982 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -93,7 +93,7 @@ def group_params(model, param_groups_cfg=None): if 'regular_exp' in params_dict[g_name]: regular_exp = params_dict[g_name]['regular_exp'] group_matcher = re.compile(regular_exp) - else: + else: group_matcher = re.compile(g_name) if group_matcher.match(name): params_dict[g_name]["params"].append((name, param)) @@ -211,4 +211,4 @@ def build_optimizer(config, lr_scheduler, model, epochs, step_each_epoch, lr_dec grad_clip=grad_clip, **config) logger.debug("build optimizer ({}) success..".format(optim)) - return optim \ No newline at end of file + return optim diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 19cd3428..f556240a 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -230,4 +230,4 @@ def get_lr(self, group_id=0): @paddle.no_grad() def step(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/passl/optimizer/utils/__init__.py b/passl/optimizer/utils/__init__.py index 9d9f7a4a..d79233f8 100644 --- a/passl/optimizer/utils/__init__.py +++ b/passl/optimizer/utils/__init__.py @@ -1 +1,15 @@ -from .group_params import * \ No newline at end of file +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
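The multi-crop forward in `swav_resnet.py` shown above groups crops by spatial size (`unique_consecutive` over the crop resolutions followed by `cumsum`) so that same-sized crops go through the backbone as one concatenated batch. A small illustrative sketch of that grouping, assuming same-sized crops are consecutive as in the usual 2 global + 6 local crop setup:

```python
import numpy as np

def crop_group_ends(crop_sizes):
    """Return the end index of each run of equal crop sizes, e.g.
    [224, 224, 96, 96, 96, 96, 96, 96] -> [2, 8]."""
    sizes = np.asarray(crop_sizes)
    change_points = np.flatnonzero(np.diff(sizes)) + 1            # positions where the size changes
    run_lengths = np.diff(np.concatenate(([0], change_points, [len(sizes)])))
    return np.cumsum(run_lengths).tolist()

print(crop_group_ends([224, 224, 96, 96, 96, 96, 96, 96]))  # [2, 8]
```

Each slice of same-sized crops can then be concatenated and encoded in a single forward pass instead of one pass per crop.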
+ +from .group_params import * diff --git a/passl/optimizer/utils/group_params.py b/passl/optimizer/utils/group_params.py index 9108222c..bd04904d 100644 --- a/passl/optimizer/utils/group_params.py +++ b/passl/optimizer/utils/group_params.py @@ -191,4 +191,4 @@ def param_group_weight_decay( param_groups[new_group_name]["weight_decay"] = this_decay param_groups[new_group_name]["params"].append((name, param)) - return param_groups \ No newline at end of file + return param_groups diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index bb0522e3..a538bb70 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -39,4 +39,4 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch): else: lr = lr_config['learning_rate'] logger.debug("build lr ({}) success..".format(lr)) - return lr \ No newline at end of file + return lr diff --git a/passl/scheduler/lr_callable.py b/passl/scheduler/lr_callable.py index 62a46155..137fdc82 100644 --- a/passl/scheduler/lr_callable.py +++ b/passl/scheduler/lr_callable.py @@ -18,4 +18,4 @@ def __init__(self, learning_rate): self.lr = learning_rate def __call__(self, param_group, epoch): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/passl/utils/io.py b/passl/utils/io.py index 8904215c..ab166eaf 100644 --- a/passl/utils/io.py +++ b/passl/utils/io.py @@ -157,12 +157,12 @@ def save_checkpoint(net, if local_rank == 0: if loss_scaler is not None: opt_state_dict['scaler_state'] = loss_scaler.state_dict() - + # Solve AttrDict can't pickle error for group in opt_state_dict['param_groups']: if 'LRScheduler' in group: group['LRScheduler'] = dict(group['LRScheduler']) - + for model_prefix in model_prefixs: paddle.save(opt_state_dict, model_prefix + ".pdopt") paddle.save(metric_info, model_prefix + ".pdstates") diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index d14c1b81..3bd16663 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -92,7 +92,7 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co ```bibtex @misc{caron2021unsupervised, - title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments}, + title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments}, author={Mathilde Caron and Ishan Misra and Julien Mairal and Priya Goyal and Piotr Bojanowski and Armand Joulin}, year={2021}, eprint={2006.09882}, diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 6d9687ab..11feaa57 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -50,7 +50,7 @@ Optimizer: last_epoch: -1 param_groups: - name: res_model.projection_head - lr: + lr: name: MultiStepDecay learning_rate: 5 milestones: [12, 16] @@ -113,4 +113,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] \ No newline at end of file + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index bf7bdf5b..37247fde 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -24,7 +24,7 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_linearprobe - output_dim: 0 + 
output_dim: 0 eval_mode: True class_num: 1000 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index e52a9d0f..3a598182 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -17,11 +17,11 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1 #,2,5 +export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml - \ No newline at end of file + diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index d30ff34b..d1f6e86e 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -23,4 +23,4 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml - # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file + # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh index 187b8e8b..f56f1e0b 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh @@ -27,4 +27,4 @@ python -m paddle.distributed.launch \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained \ No newline at end of file + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh index 7c748f15..fd8a7709 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh @@ -27,4 +27,4 @@ python -m paddle.distributed.launch \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 \ - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained \ No newline at end of file + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh index 954705ad..2fa7ad20 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh @@ -26,4 +26,4 @@ python -m paddle.distributed.launch \ -o Global.print_batch_step=1 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ - -o Global.flags.FLAGS_cudnn_deterministic=1 \ No newline at end of file + -o Global.flags.FLAGS_cudnn_deterministic=1 From 45d527359158ba55efbedcb80e621e9801ff4c4d Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 17:33:33 +0800 Subject: [PATCH 21/46] valid_ft --- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 3 --- tasks/ssl/swav/pretrain.sh | 6 ++--- tasks/ssl/swav/pretrain_1N8C.sh | 26 +++++++++++++++++++ tests/CI/case.sh | 2 +- 4 files changed, 29 insertions(+), 8 
deletions(-) create mode 100644 tasks/ssl/swav/pretrain_1N8C.sh diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 4cf7398e..deb515a3 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -21,9 +21,6 @@ Global: # FP16 setting FP16: level: O1 -# GradScaler: -# init_loss_scaling: 65536.0 -# incr_every_n_steps: 2000 DistributedStrategy: data_parallel: True diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index d1f6e86e..e45ea53c 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS -export PADDLE_NNODES=1 +export PADDLE_NNODES=4 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 @@ -22,5 +21,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml - # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain_1N8C.sh b/tasks/ssl/swav/pretrain_1N8C.sh new file mode 100644 index 00000000..d1f6e86e --- /dev/null +++ b/tasks/ssl/swav/pretrain_1N8C.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml + # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml diff --git a/tests/CI/case.sh b/tests/CI/case.sh index f0c8772d..21a1f9b5 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -398,7 +398,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=2.23445 + loss_base=1.95351 ips_base=793.89847 mem_base=5.67 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} From 3ea3e73012bbeda0b4451656fba6d76e753c87ec Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 09:58:45 +0800 Subject: [PATCH 22/46] backbone_config --- passl/data/dataset/multicrop_dataset.py | 2 -- passl/models/swav.py | 7 ++++++- .../swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 +++- .../swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 8 +++++--- .../swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 12 +++++++----- .../swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 10 ++++++---- tasks/ssl/swav/pretrain.sh | 2 +- tasks/ssl/swav/pretrain_1N8C.sh | 1 - 8 files changed, 28 insertions(+), 18 deletions(-) diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index a4488e7b..3a098b01 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -23,7 +23,6 @@ from passl.data.dataset.imagefolder_dataset import ImageFolder from passl.data.preprocess import ( RandomApply, - # GaussianBlur, SimCLRGaussianBlur, NormalizeImage, RandomGrayscale, @@ -77,7 +76,6 @@ def __getitem__(self, index): def get_pil_gaussian_blur(p=0.5): - # gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) gaussian_blur = SimCLRGaussianBlur(sigma=[.1, 2.]) rnd_gaussian_blur = RandomApply([gaussian_blur], p=p) return rnd_gaussian_blur diff --git a/passl/models/swav.py b/passl/models/swav.py index 50795189..989b11ab 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -40,7 +40,12 @@ class SwAV(Model): def __init__(self, **kwargs): super().__init__() - self.res_model = swavresnet50(**kwargs) + backbone_config = kwargs['backbone'] + backbone_type = backbone_config.pop("type", None) + if backbone_type is not None: + self.res_model = eval(backbone_type)(**backbone_config) + else: + AttributeError(f'Backbone type is not assigned, please assign it.') def _load_model(self, path, model, tag): path = path + ".pdparams" diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 11feaa57..7dc624cc 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -25,7 +25,9 @@ DistributedStrategy: # model architecture Model: name: 
swav_resnet50_finetune - output_dim: 1000 + backbone: + type: swavresnet50 + output_dim: 1000 # loss function config for traing/eval process Loss: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 37247fde..68b54a47 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -24,9 +24,11 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_linearprobe - output_dim: 0 - eval_mode: True - class_num: 1000 + backbone: + type: swavresnet50 + output_dim: 0 + eval_mode: True + class_num: 1000 # loss function config for traing/eval process Loss: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index a6d0c2e5..2e5b7a7d 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -28,16 +28,18 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_pretrain - apex: False + backbone: + type: swavresnet50 + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 + apex: False queue_length: 3804 # 0 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 freeze_prototypes_niters: 5005 # 313 - normalize: True - hidden_mlp: 2048 - output_dim: 128 - nmb_prototypes: 3000 Optimizer: name: MomentumLARC diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index deb515a3..cdc0ebf8 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -28,16 +28,18 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_pretrain + backbone: + type: swavresnet50 + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 apex: False queue_length: 0 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 freeze_prototypes_niters: 313 - normalize: True - hidden_mlp: 2048 - output_dim: 128 - nmb_prototypes: 3000 Optimizer: name: MomentumLARC diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index e45ea53c..ad460394 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-unset DISTRIBUTED_TRAINER_ENDPOINTS +# unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=4 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/tasks/ssl/swav/pretrain_1N8C.sh b/tasks/ssl/swav/pretrain_1N8C.sh index d1f6e86e..ce6caad8 100644 --- a/tasks/ssl/swav/pretrain_1N8C.sh +++ b/tasks/ssl/swav/pretrain_1N8C.sh @@ -23,4 +23,3 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml - # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml From 675e075959bd200d4acc033d1998ea4cccbdebe3 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 14:31:02 +0800 Subject: [PATCH 23/46] verified --- passl/data/dataset/imagefolder_dataset.py | 7 +-- passl/data/dataset/multicrop_dataset.py | 2 +- .../engine/loops/contrastive_learning_loop.py | 5 +-- passl/models/swav.py | 22 ++++----- tasks/ssl/swav/README.md | 45 ++++++++++++------- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 +- ...swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 2 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 8 ++-- 8 files changed, 54 insertions(+), 41 deletions(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index dac2634a..a42c6425 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -20,6 +20,7 @@ import paddle +from passl.utils import logger from passl.data.dataset import default_loader IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", @@ -66,8 +67,8 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples_tag == "semi_10": - # connection reset - # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + # connection reset proxyon + # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(samples_tag.split('_')[-1]) + "percent.txt") # list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] subset_file = str(samples_tag.split('_')[-1]) + "percent.txt" with open(subset_file, 'r') as f: @@ -77,7 +78,7 @@ def __init__(self, else: raise NotImplementedError('{} is not implemented'.format(samples)) - print(f'find total {len(classes)} classes and {len(samples)} images.') + logger.info(f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index 3a098b01..f3acce64 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
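Regarding the `semi_1` / `semi_10` branch touched above: it restricts ImageNet to the 1% and 10% label splits published with SimCLR by keeping only samples whose file names appear in `1percent.txt` or `10percent.txt`. A hedged sketch of that filtering step; the helper name and sample layout are illustrative, not the exact PASSL implementation:

```python
import os

def filter_semi_subset(samples, samples_tag):
    """Keep only (path, target) pairs whose file name is listed in the
    SimCLR subset file, e.g. '10percent.txt' for samples_tag='semi_10'."""
    subset_file = samples_tag.split('_')[-1] + "percent.txt"
    with open(subset_file) as f:
        keep = {line.strip() for line in f if line.strip()}
    return [(path, target) for path, target in samples
            if os.path.basename(path) in keep]
```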
diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 5bdefea6..2cd5c91c 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -16,10 +16,10 @@ from __future__ import division from __future__ import print_function -import collections import paddle -from passl.core import grad_sync +import collections +from passl.core import grad_sync from passl.utils import logger from .loop import TrainingEpochLoop @@ -52,7 +52,6 @@ def forward_backward(self, batch, total_iterations): if isinstance(loss_dict, paddle.Tensor): loss_dict = {'loss': loss_dict} - for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps diff --git a/passl/models/swav.py b/passl/models/swav.py index 989b11ab..0a423b30 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -47,21 +47,21 @@ def __init__(self, **kwargs): else: AttributeError(f'Backbone type is not assigned, please assign it.') - def _load_model(self, path, model, tag): + def _load_model(self, path, tag): path = path + ".pdparams" if os.path.isfile(path): para_state_dict = paddle.load(path) # resnet - model_state_dict = model.state_dict() + model_state_dict = self.state_dict() keys = model_state_dict.keys() num_params_loaded = 0 for k in keys: if k not in para_state_dict: - print("{} is not in pretrained model".format(k)) + logger.info("{} is not in pretrained model".format(k)) elif list(para_state_dict[k].shape) != list(model_state_dict[k] .shape): - print( + logger.info( "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" .format(k, para_state_dict[k].shape, model_state_dict[k] .shape)) @@ -71,11 +71,11 @@ def _load_model(self, path, model, tag): para_state_dict[k] = para_state_dict[k].astype(model_state_dict[k].dtype) model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 - model.set_dict(model_state_dict) - print("There are {}/{} variables loaded into {}.".format( + self.set_dict(model_state_dict) + logger.info("There are {}/{} variables loaded into {}.".format( num_params_loaded, len(model_state_dict), tag)) else: - print("No pretrained weights found in {} => training with random weights".format(tag)) + logger.info("No pretrained weights found in {} => training with random weights".format(tag)) def load_pretrained(self, path, rank=0, finetune=False): pass @@ -106,7 +106,7 @@ def __init__(self, class_num=1000, **kwargs): self.apply(self._freeze_norm) def load_pretrained(self, path, rank=0, finetune=False): - self._load_model(path, self.res_model, 'backbone') + self._load_model(path, 'backbone') def forward(self, inp): with paddle.no_grad(): @@ -121,7 +121,7 @@ def __init__(self, **kwargs): self.apply(self._freeze_norm) def load_pretrained(self, path, rank=0, finetune=False): - self._load_model(path, self.res_model, 'backbone') + self._load_model(path, 'backbone') def forward(self, inp): return self.res_model(inp) @@ -137,8 +137,8 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep self.apply(self._freeze_norm) - def load_pretrained(self, path, rank=0, finetune=False): - self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') + # def load_pretrained(self, path, rank=0, finetune=False): + # self._load_model('swav_800ep_pretrain.pdparams', 'backbone') @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 3bd16663..4cd19543 
100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -59,22 +59,35 @@ python -m paddle.distributed.launch \ ``` ## How to End-to-End Fine-tuning -To perform end-to-end fine-tuning for SwAV, run the training with the trained PASSL format checkpoint: - -```bash -unset PADDLE_TRAINER_ENDPOINTS -export PADDLE_NNODES=1 -export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3 -export FLAGS_stop_check_timeout=3600 - -python -m paddle.distributed.launch \ - --nnodes=$PADDLE_NNODES \ - --master=$PADDLE_MASTER \ - --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml -``` +To perform end-to-end fine-tuning for SwAV: + +* First, download the data split text files with the following commands: + ```bash + cd PASSL + + wget "https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/10percent.txt" + + wget "https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/1percent.txt" + ``` + +* Then, download the pretrained model to `./pretrained/swav/swav_resnet50_in1k_800ep_pretrained.pdparams` + +* Finally, run the training with the trained PASSL format checkpoint: + ```bash + unset PADDLE_TRAINER_ENDPOINTS + export PADDLE_NNODES=1 + export PADDLE_MASTER="127.0.0.1:12538" + export CUDA_VISIBLE_DEVICES=0,1,2,3 + export FLAGS_stop_check_timeout=3600 + + python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml \ + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + ``` ## Other Configurations We provide more directly runnable configurations, see [SwAV Configurations](./configs/).
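One detail worth calling out for the fine-tuning command documented above: `Global.pretrained_model` is passed without the `.pdparams` suffix because SwAV's `_load_model` appends it and then copies only the keys whose shapes match, logging the rest. A condensed sketch of that tolerant loading behaviour (illustrative, not the exact PASSL code):

```python
import os
import paddle

def load_pretrained_tolerant(model, prefix):
    """Load `prefix + '.pdparams'` into `model`, skipping missing or
    shape-mismatched keys instead of raising."""
    path = prefix + ".pdparams"
    if not os.path.isfile(path):
        print(f"No pretrained weights found at {path}, training with random weights")
        return
    pretrained = paddle.load(path)
    state = model.state_dict()
    loaded = 0
    for key, value in state.items():
        if key in pretrained and list(pretrained[key].shape) == list(value.shape):
            state[key] = pretrained[key].astype(value.dtype)
            loaded += 1
    model.set_dict(state)
    print(f"Loaded {loaded}/{len(state)} variables from {path}")
```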
diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 7dc624cc..4eb691fe 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: swav_800ep_pretrain_adjustresnet + pretrained_model: swav_800ep_pretrain_adjustresnetn finetune: True - output_dir: ./output/semi_0426_semi10 + output_dir: ./output/semi_0506_semi10 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 68b54a47..08a1dc25 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -28,7 +28,7 @@ Model: type: swavresnet50 output_dim: 0 eval_mode: True - class_num: 1000 + class_num: 1000 # loss function config for traing/eval process Loss: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 2e5b7a7d..75961e5a 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -13,7 +13,7 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 400 # 800 + epochs: 400 print_batch_step: 100 use_visualdl: False seed: 31 @@ -34,12 +34,12 @@ Model: hidden_mlp: 2048 output_dim: 128 nmb_prototypes: 3000 - apex: False - queue_length: 3804 # 0 + apex: False + queue_length: 3804 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 - freeze_prototypes_niters: 5005 # 313 + freeze_prototypes_niters: 5005 Optimizer: name: MomentumLARC From 2c259818d80f170689880ef936af6e72a3d59a8e Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 14:43:38 +0800 Subject: [PATCH 24/46] fix --- tests/CI/case.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 21a1f9b5..192eb715 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -390,10 +390,10 @@ function simsiam_resnet50_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { +function swav_resnet50_224_ft_in1k_1n4c_dp() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` @@ -405,10 +405,10 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { +function swav_resnet50_224_lp_in1k_1n8c_dp() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print 
$1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` From df744d0d1f37668b3beae63ed5ab6b41cdf8e003 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 16:12:55 +0800 Subject: [PATCH 25/46] update --- passl/data/dataset/imagefolder_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index a42c6425..76a5d77c 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -78,7 +78,7 @@ def __init__(self, else: raise NotImplementedError('{} is not implemented'.format(samples)) - logger.info(f'find total {len(classes)} classes and {len(samples)} images.') + print(f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions From 948e9b9f71984d079ede8932ba31fc8e92e4ea4a Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 17:49:08 +0800 Subject: [PATCH 26/46] fix --- passl/models/swav.py | 4 ++-- .../swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 2 +- tasks/ssl/swav/linearprobe.sh | 2 +- tasks/ssl/swav/pretrain.sh | 5 +++-- tests/CI/case.sh | 4 ++-- ..._1n4c_dp.sh => swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh} | 0 ..._1n8c_dp.sh => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh} | 0 7 files changed, 9 insertions(+), 8 deletions(-) rename tests/CI/ssl/swav/{swav_resnet50_224_ft_in1k_1n4c_dp.sh => swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh} (100%) rename tests/CI/ssl/swav/{swav_resnet50_224_lp_in1k_1n8c_dp.sh => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh} (100%) diff --git a/passl/models/swav.py b/passl/models/swav.py index 0a423b30..3aa13757 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -72,8 +72,8 @@ def _load_model(self, path, tag): model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 self.set_dict(model_state_dict) - logger.info("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), tag)) + logger.info("There are {}/{} variables loaded into {} with {}.".format( + num_params_loaded, len(model_state_dict), tag, path)) else: logger.info("No pretrained weights found in {} => training with random weights".format(tag)) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 4eb691fe..c7353402 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: swav_800ep_pretrain_adjustresnetn + pretrained_model: epoch_73n finetune: True output_dir: ./output/semi_0506_semi10 device: gpu diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 07ced970..ad2845a9 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -16,7 +16,7 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=1,2,3,0,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index ad460394..1288c5d3 
100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -# unset DISTRIBUTED_TRAINER_ENDPOINTS -export PADDLE_NNODES=4 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=2 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 192eb715..7d215a78 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -390,7 +390,7 @@ function simsiam_resnet50_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_ft_in1k_1n4c_dp() { +function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh @@ -405,7 +405,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_lp_in1k_1n8c_dp() { +function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh similarity index 100% rename from tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh rename to tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh similarity index 100% rename from tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh rename to tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh From 49b7daccd77d9610bc3575eae8f3655d334bed89 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 8 May 2023 10:29:37 +0800 Subject: [PATCH 27/46] fix_ci --- tests/CI/case.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 7d215a78..21a1f9b5 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -393,7 +393,7 @@ function simsiam_resnet50_lp_in1k_1n8c_dp_fp32() { function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` @@ -408,7 +408,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` From 3cfb191d633d99a3a54db26fdcf083be72a6d48c Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 15 May 2023 11:04:30 +0800 Subject: [PATCH 28/46] edit_accord_comment --- passl/engine/engine.py | 18 +-- 
.../engine/loops/contrastive_learning_loop.py | 9 +- passl/engine/loops/loop.py | 10 +- passl/models/swav.py | 112 +++++++++++++++- passl/models/swav_resnet.py | 124 ------------------ passl/scheduler/__init__.py | 8 +- passl/scheduler/lr_scheduler.py | 6 +- 7 files changed, 132 insertions(+), 155 deletions(-) delete mode 100644 passl/models/swav_resnet.py diff --git a/passl/engine/engine.py b/passl/engine/engine.py index c3277561..c43b199b 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -233,14 +233,14 @@ def worker_init_fn(worker_id): self.lr_decay_unit) # load pretrained model - if self.config["Global"]["pretrained_model"] is not None: - assert isinstance( - self.config["Global"]["pretrained_model"], str - ), "pretrained_model type is not available. Please use `string`." - self.model.load_pretrained( - self.config["Global"]["pretrained_model"], - self.config["Global"]["rank"], - self.config["Global"].get("finetune", False)) + if self.config["Global"]["pretrained_model"] is not None: + assert isinstance( + self.config["Global"]["pretrained_model"], str + ), "pretrained_model type is not available. Please use `string`." + self.model.load_pretrained( + self.config["Global"]["pretrained_model"], + self.config["Global"]["rank"], + self.config["Global"].get("finetune", False)) # for distributed if self.config["Global"]["distributed"]: @@ -356,7 +356,7 @@ def train(self): self.vdl_writer.close() @paddle.no_grad() - def eval(self, epoch_id=0): + def eval(self): assert self.mode in ["train", "eval"] self.model.eval() self.validating = True diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 2cd5c91c..663703cc 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -28,7 +28,7 @@ class ContrastiveLearningTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): super().__init__(trainer, epochs, max_train_step=max_train_step, val_loop=val_loop) - def forward_backward(self, batch, total_iterations): + def forward_backward(self, batch): # Gradient Merge(GuoxiaWang): Accumulate gradient over multiple # steps to save on memory. 
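The `forward_backward` signature change above drops the explicit `total_iterations` argument; the loop now stores it as `self.total_iterations` and hands it to the model's `after_loss_backward` hook (see the following hunks), which SwAV uses to keep the prototype weights frozen for the first `freeze_prototypes_niters` updates. A minimal sketch of that hook as a standalone function; the wiring here is illustrative, PASSL implements it as a method on `SwAVPretrain`:

```python
def freeze_prototypes_early(model, iteration, freeze_prototypes_niters):
    """During warm-up, drop prototype gradients so only the backbone and
    projection head are updated; afterwards the prototypes train normally."""
    if iteration < freeze_prototypes_niters:
        for name, param in model.named_parameters():
            if "prototypes" in name and param.grad is not None:
                param.clear_grad()
```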
@@ -63,19 +63,18 @@ def forward_backward(self, batch, total_iterations): scaled.backward() try: - self.trainer.model.after_loss_backward(total_iterations) + self.trainer.model.after_loss_backward(self.total_iterations) except AttributeError: logger.warning("Model has no after_loss_backward method, ignored this process") return final_loss_dict - def train_one_step(self, batch, total_iterations): - + def train_one_step(self, batch): # remove label batch = batch[0] # do forward and backward - loss_dict = self.forward_backward(batch, total_iterations) + loss_dict = self.forward_backward(batch) grad_sync(self.trainer.optimizer.param_groups) diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index 959bb386..c14978a5 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -219,7 +219,7 @@ def run(self): self.trainer.train_dataloader.batch_sampler.set_epoch(epoch_id) # for one epoch train - self.train_one_epoch(epoch_id) + self.train_one_epoch() if self.trainer.lr_decay_unit == 'epoch': self.trainer.optimizer.lr_step(self.cur_epoch_id) @@ -257,14 +257,14 @@ def run(self): self.trainer.training = False - def train_one_epoch(self, epoch_id): + def train_one_epoch(self): self.trainer.model.train() tic = time.time() for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - total_iterations = (epoch_id-1)*self.total_batch_idx + batch_idx + self.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( @@ -289,7 +289,7 @@ def train_one_epoch(self, epoch_id): self.global_step += 1 # do forward and backward - out, loss_dict = self.train_one_step(batch, total_iterations) + out, loss_dict = self.train_one_step(batch) self.time_info["batch_cost"].update(time.time() - tic) @@ -311,7 +311,7 @@ def train_one_epoch(self, epoch_id): tic = time.time() - def train_one_step(self, batch, total_iterations): + def train_one_step(self, batch): raise NotImplementedError def save_checkpoint(self): diff --git a/passl/models/swav.py b/passl/models/swav.py index 3aa13757..cb9c7298 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -13,19 +13,17 @@ # limitations under the License. 
import os +import functools import numpy as np from sys import flags -from collections import defaultdict import paddle import paddle.nn as nn from passl.nn import init -from passl.scheduler import build_lr_scheduler from passl.utils import logger -from passl.models.swav_resnet import swavresnet50 from passl.models.base_model import Model - +from passl.models.resnet import ResNet, BottleneckBlock __all__ = [ 'swav_resnet50_finetune', @@ -237,3 +235,109 @@ def forward(self, x): x = self.av_pool(x) x = x.reshape((x.shape[0], -1)) return self.linear(x) + + +def kaiming_normal_init(param, **kwargs): + initializer = nn.initializer.KaimingNormal(**kwargs) + initializer(param, param.block) + +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + + +class SwAVResNet(paddle.nn.Layer): + def __init__(self, block, depth, + normalize=False, output_dim=0, hidden_mlp=0, + nmb_prototypes=0, eval_mode=False): + + super(SwAVResNet, self).__init__() + self.l2norm = normalize + self.eval_mode = eval_mode + num_out_filters = 512 + + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + if output_dim == 0: + self.projection_head = None + elif hidden_mlp == 0: + self.projection_head = paddle.nn.Linear(in_features= + num_out_filters * block.expansion, out_features=output_dim) + else: + self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( + in_features=num_out_filters * block.expansion, out_features + =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, + momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, + bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), + paddle.nn.Linear(in_features=hidden_mlp, out_features= + output_dim)) + + self.prototypes = None + if isinstance(nmb_prototypes, list): + self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) + elif nmb_prototypes > 0: + self.prototypes = paddle.nn.Linear(in_features=output_dim, + out_features=nmb_prototypes, bias_attr=False) + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) + elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): + constant_init(sublayer.weight, value=1.0) + constant_init(sublayer.bias, value=0.0) + + self.encoder = functools.partial(ResNet, block=block, depth=depth)(with_pool=False, class_num=0) + + def forward_backbone(self, x): + x = self.encoder(x) + + if self.eval_mode: + return x + + x = self.avgpool(x) + x = paddle.flatten(x=x, start_axis=1) + return x + + def forward_head(self, x): + if self.projection_head is not None: + x = self.projection_head(x) + if self.l2norm: + x = paddle.nn.functional.normalize(x=x, axis=1, p=2) + if self.prototypes is not None: + return x, self.prototypes(x) + return x + + def forward(self, inputs): + if not isinstance(inputs, list): + inputs = [inputs] + + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
+ to_tensor(data=[inp.shape[-1] for inp in inputs]), + return_counts=True)[1], axis=0) # padiff + start_idx = 0 + for end_idx in idx_crops: + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) + if start_idx == 0: + output = _out + else: + output = paddle.concat(x=(output, _out)) + start_idx = end_idx + return self.forward_head(output) + + +class MultiPrototypes(paddle.nn.Layer): + def __init__(self, output_dim, nmb_prototypes): + super(MultiPrototypes, self).__init__() + self.nmb_heads = len(nmb_prototypes) + for i, k in enumerate(nmb_prototypes): + self.add_module('prototypes' + str(i), paddle.nn.Linear( + in_features=output_dim, out_features=k, bias_attr=False)) + + def forward(self, x): + out = [] + for i in range(self.nmb_heads): + out.append(getattr(self, 'prototypes' + str(i))(x)) + return out + + +def swavresnet50(**kwargs): + return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) diff --git a/passl/models/swav_resnet.py b/passl/models/swav_resnet.py deleted file mode 100644 index 03e5dffb..00000000 --- a/passl/models/swav_resnet.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import functools -import paddle.nn as nn - -from .resnet import ResNet, BottleneckBlock - -def kaiming_normal_init(param, **kwargs): - initializer = nn.initializer.KaimingNormal(**kwargs) - initializer(param, param.block) - -def constant_init(param, **kwargs): - initializer = nn.initializer.Constant(**kwargs) - initializer(param, param.block) - - -class SwAVResNet(paddle.nn.Layer): - def __init__(self, block, depth, - normalize=False, output_dim=0, hidden_mlp=0, - nmb_prototypes=0, eval_mode=False): - - super(SwAVResNet, self).__init__() - self.l2norm = normalize - self.eval_mode = eval_mode - num_out_filters = 512 - - self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - - if output_dim == 0: - self.projection_head = None - elif hidden_mlp == 0: - self.projection_head = paddle.nn.Linear(in_features= - num_out_filters * block.expansion, out_features=output_dim) - else: - self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( - in_features=num_out_filters * block.expansion, out_features - =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, - momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, - bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), - paddle.nn.Linear(in_features=hidden_mlp, out_features= - output_dim)) - - self.prototypes = None - if isinstance(nmb_prototypes, list): - self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) - elif nmb_prototypes > 0: - self.prototypes = paddle.nn.Linear(in_features=output_dim, - out_features=nmb_prototypes, bias_attr=False) - for sublayer in self.sublayers(): - if isinstance(sublayer, nn.Conv2D): - kaiming_normal_init(sublayer.weight) # todo mode='fan_out', - elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): - constant_init(sublayer.weight, value=1.0) - 
constant_init(sublayer.bias, value=0.0) - - self.encoder = functools.partial(ResNet, block=block, depth=depth)(with_pool=False, class_num=0) - - def forward_backbone(self, x): - x = self.encoder(x) - - if self.eval_mode: - return x - - x = self.avgpool(x) - x = paddle.flatten(x=x, start_axis=1) - return x - - def forward_head(self, x): - if self.projection_head is not None: - x = self.projection_head(x) - if self.l2norm: - x = paddle.nn.functional.normalize(x=x, axis=1, p=2) - if self.prototypes is not None: - return x, self.prototypes(x) - return x - - def forward(self, inputs): - if not isinstance(inputs, list): - inputs = [inputs] - - idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. - to_tensor(data=[inp.shape[-1] for inp in inputs]), - return_counts=True)[1], axis=0) # padiff - start_idx = 0 - for end_idx in idx_crops: - _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) - if start_idx == 0: - output = _out - else: - output = paddle.concat(x=(output, _out)) - start_idx = end_idx - return self.forward_head(output) - - -class MultiPrototypes(paddle.nn.Layer): - def __init__(self, output_dim, nmb_prototypes): - super(MultiPrototypes, self).__init__() - self.nmb_heads = len(nmb_prototypes) - for i, k in enumerate(nmb_prototypes): - self.add_module('prototypes' + str(i), paddle.nn.Linear( - in_features=output_dim, out_features=k, bias_attr=False)) - - def forward(self, x): - out = [] - for i in range(self.nmb_heads): - out.append(getattr(self, 'prototypes' + str(i))(x)) - return out - - -def swavresnet50(**kwargs): - return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index a538bb70..002ee755 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -12,11 +12,10 @@ # limitations under the License. 
import paddle -from paddle.optimizer.lr import MultiStepDecay from passl.utils import logger -from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly +from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly, MultiStepDecay from .lr_callable import LRCallable @@ -24,11 +23,6 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch): lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) if 'name' in lr_config: lr_name = lr_config.pop('name') - if "MultiStepDecay" in lr_name: - lr_config.pop('epochs') - lr_config.pop('step_each_epoch') - lr_config.pop('decay_unit') - print(lr_config) lr = eval(lr_name)(**lr_config) if isinstance(lr, paddle.optimizer.lr.LRScheduler): return lr diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 223ca349..2b91405a 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -19,7 +19,6 @@ from paddle.optimizer import lr from passl.utils import logger - class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, @@ -200,3 +199,8 @@ def get_lr(self): return self.base_lr * pow(1 - float(self.last_epoch - self.warmups) / float(self.T_max - self.warmups), 2) + + +class MultiStepDecay(lr.MultiStepDecay): + def __init__(self, learning_rate, milestones, gamma, last_epoch, **kwargs): + super().__init__(learning_rate, milestones, gamma, last_epoch) From d9f9bf1ad0c87e48e1ba798888dcc9cf88e2ff58 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 15 May 2023 11:33:42 +0800 Subject: [PATCH 29/46] fix --- passl/models/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 85f9663b..0792faae 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,6 @@ from .convnext import * from .mocov3 import * from .swav import * -from .swav_resnet import * from .simsiam import * __all__ = ["build_model"] From dc55ff6490c422d34925f94b468fa870ec58c097 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 16 May 2023 15:25:22 +0800 Subject: [PATCH 30/46] fix_by_comment --- passl/data/dataset/__init__.py | 1 + passl/data/dataset/fewshot_dataset.py | 58 +++++++++++++++++++ passl/data/dataset/imagefolder_dataset.py | 21 +------ passl/engine/engine.py | 2 +- passl/engine/loops/classification_loop.py | 2 +- .../engine/loops/contrastive_learning_loop.py | 5 -- passl/engine/loops/loop.py | 3 +- passl/models/swav.py | 13 ++++- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 8 +-- tests/CI/case.sh | 4 +- 10 files changed, 81 insertions(+), 36 deletions(-) create mode 100644 passl/data/dataset/fewshot_dataset.py diff --git a/passl/data/dataset/__init__.py b/passl/data/dataset/__init__.py index b19912e1..011cae11 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -64,3 +64,4 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder from .multicrop_dataset import MultiCropDataset +from .fewshot_dataset import FewShotDataset diff --git a/passl/data/dataset/fewshot_dataset.py b/passl/data/dataset/fewshot_dataset.py new file mode 100644 index 00000000..809abd0a --- /dev/null +++ b/passl/data/dataset/fewshot_dataset.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from passl.utils import logger +from passl.data.dataset import default_loader +from passl.data.dataset import ImageFolder + +IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", + ".tiff", ".webp") + + +class FewShotDataset(ImageFolder): + """ + This class inherits from :class:`~passl.data.datasets.ImageFolder`, so + the dataset takes a txt file containing image names to find the data. + Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in a numpy image + and returns a transformed version. E.g., ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + txt_file_name (string): Path to the txt file listing the image names to use. + """ + + def __init__(self, + root, + transform=None, + target_transform=None, + loader=default_loader, + extensions=IMG_EXTENSIONS, + txt_file_name=None): + super(FewShotDataset, self).__init__(root=root, transform=transform, + target_transform=target_transform, loader=loader, + extensions=extensions) + + assert txt_file_name is not None, "The txt_file_name must be assigned." + if os.path.isfile(txt_file_name): + with open(txt_file_name, 'r') as f: + list_imgs = [li.split('\n')[0] for li in f.readlines()] + + self.imgs = [(os.path.join(root, li.split('_')[0], li), self.class_to_idx[li.split('_')[0]]) for li in list_imgs] + else: + raise FileNotFoundError('{} is not existed'.format(txt_file_name)) + print('Previous information is not correct.') + print(f'Actually, we have total {len(self.imgs)} images in semi-training setting.') diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 76a5d77c..fd1429e8 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -13,14 +13,10 @@ # limitations under the License.
import os -import urllib -import urllib.request import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union import paddle - -from passl.utils import logger from passl.data.dataset import default_loader IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", @@ -59,24 +55,11 @@ def __init__(self, transform=None, target_transform=None, loader=default_loader, - extensions=IMG_EXTENSIONS, - samples_tag=None): + extensions=IMG_EXTENSIONS): self.root = root classes, class_to_idx = self.find_classes(self.root) - if samples_tag is None: - samples = self.make_dataset(self.root, class_to_idx, extensions) - elif samples_tag == "semi_1" or samples_tag == "semi_10": - # connection reset proxyon - # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(samples_tag.split('_')[-1]) + "percent.txt") - # list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] - subset_file = str(samples_tag.split('_')[-1]) + "percent.txt" - with open(subset_file, 'r') as f: - list_imgs = [li.split('\n')[0] for li in f.readlines()] - - samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] - else: - raise NotImplementedError('{} is not implemented'.format(samples)) + samples = self.make_dataset(self.root, class_to_idx, extensions) print(f'find total {len(classes)} classes and {len(samples)} images.') diff --git a/passl/engine/engine.py b/passl/engine/engine.py index c43b199b..d0008aa7 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -233,7 +233,7 @@ def worker_init_fn(worker_id): self.lr_decay_unit) # load pretrained model - if self.config["Global"]["pretrained_model"] is not None: + if self.config["Global"]["pretrained_model"] is not None: assert isinstance( self.config["Global"]["pretrained_model"], str ), "pretrained_model type is not available. Please use `string`." 
diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 659bcc19..92ce83f0 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -77,7 +77,7 @@ def forward_backward(self, batch): out = paddle.concat(final_out, axis=0) return out, final_loss_dict - def train_one_step(self, batch, total_iterations=None): + def train_one_step(self, batch): # do forward and backward out, loss_dict = self.forward_backward(batch) diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 663703cc..428d4853 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -62,11 +62,6 @@ def forward_backward(self, batch): scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - try: - self.trainer.model.after_loss_backward(self.total_iterations) - except AttributeError: - logger.warning("Model has no after_loss_backward method, ignored this process") - return final_loss_dict def train_one_step(self, batch): diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index c14978a5..80b643df 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -26,6 +26,7 @@ from passl.utils import io from passl.utils import logger from passl.utils.misc import SmoothedValue +from passl.utils.infohub import runtime_info_hub class _Loop: """Basic Loops interface.""" @@ -264,7 +265,7 @@ def train_one_epoch(self): for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - self.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx + runtime_info_hub.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( diff --git a/passl/models/swav.py b/passl/models/swav.py index cb9c7298..08ea2879 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -22,6 +22,7 @@ from passl.nn import init from passl.utils import logger +from passl.utils.infohub import runtime_info_hub from passl.models.base_model import Model from passl.models.resnet import ResNet, BottleneckBlock @@ -135,9 +136,6 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep self.apply(self._freeze_norm) - # def load_pretrained(self, path, rank=0, finetune=False): - # self._load_model('swav_800ep_pretrain.pdparams', 'backbone') - @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): Q = paddle.exp(x=out / self.epsilon).t() @@ -159,6 +157,15 @@ def distributed_sinkhorn(self, out, sinkhorn_iterations=3): def forward(self, inp): bs = inp[0].shape[0] + if runtime_info_hub.total_iterations < self.freeze_prototypes_niters: + for name, p in self.res_model.named_parameters(): + if 'prototypes' in name: + p.stop_gradient = True + else: + for name, p in self.res_model.named_parameters(): + if 'prototypes' in name: + p.stop_gradient = False + # normalize the prototypes with paddle.no_grad(): w = self.res_model.prototypes.weight.clone() diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index c7353402..0427f294 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,9 +4,9 @@ Global: train_loop: 
ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: epoch_73n + pretrained_model: epoch_794 finetune: True - output_dir: ./output/semi_0506_semi10 + output_dir: ./output/semi_0515_semi10 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -63,7 +63,7 @@ Optimizer: DataLoader: Train: dataset: - name: ImageFolder + name: FewShotDataset root: data/ILSVRC2012/train transform: - RandomResizedCrop: @@ -73,7 +73,7 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_10 + txt_file_name: 10percent.txt sampler: name: DistributedBatchSampler batch_size: 128 # accum_steps: 1, total batchsize: 256 diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 21a1f9b5..1bcb646e 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -398,9 +398,9 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=1.95351 + loss_base=1.97248 ips_base=793.89847 - mem_base=5.67 + mem_base=10.74 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } From 6b1c6a5efbb4a6afd64063a630f97967b233295e Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 16 May 2023 15:47:30 +0800 Subject: [PATCH 31/46] pretrained_model --- passl/data/dataset/fewshot_dataset.py | 2 +- passl/utils/io.py | 5 ----- tasks/ssl/swav/README.md | 17 +++++++++++++++-- .../swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 ++-- .../swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 2 +- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/passl/data/dataset/fewshot_dataset.py b/passl/data/dataset/fewshot_dataset.py index 809abd0a..c696f549 100644 --- a/passl/data/dataset/fewshot_dataset.py +++ b/passl/data/dataset/fewshot_dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/passl/utils/io.py b/passl/utils/io.py index ab166eaf..cc7b5a28 100644 --- a/passl/utils/io.py +++ b/passl/utils/io.py @@ -158,11 +158,6 @@ def save_checkpoint(net, if loss_scaler is not None: opt_state_dict['scaler_state'] = loss_scaler.state_dict() - # Solve AttrDict can't pickle error - for group in opt_state_dict['param_groups']: - if 'LRScheduler' in group: - group['LRScheduler'] = dict(group['LRScheduler']) - for model_prefix in model_prefixs: paddle.save(opt_state_dict, model_prefix + ".pdopt") paddle.save(metric_info, model_prefix + ".pdstates") diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 4cd19543..2d44d7c5 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -40,9 +40,16 @@ python -m paddle.distributed.launch \ ``` ## How to Linear Classification - By default, we use momentum-SGD and a batch size of 256 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. 
+- Download pretrained model +```bash +mkdir -p pretrained/swav +wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams +``` + +- Train linear classification model + ```bash unset PADDLE_TRAINER_ENDPOINTS export PADDLE_NNODES=1 @@ -70,7 +77,13 @@ To perform end-to-end fine-tuning for SwAV: wget "https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/1percent.txt" ``` -* Then, download the pretrained models to `./pretrained/swav/swav_resnet50_in1k_800ep_pretrained.pdparams` +* Then, download the pretrained models to `./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams` + +- Download pretrained model +```bash +mkdir -p pretrained/swav +wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams +``` * Finally, run the training with the trained PASSL format checkpoint: ```bash diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 0427f294..e8715e9e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: epoch_794 + pretrained_model: ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained finetune: True - output_dir: ./output/semi_0515_semi10 + output_dir: ./output device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 08a1dc25..05c9b334 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: ./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + pretrained_model: ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained output_dir: ./output device: gpu save_interval: 1 From bc3f41f334ea3bf9d327e8ccc444cab82f97b4eb Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 10:07:08 +0800 Subject: [PATCH 32/46] add_models --- tasks/ssl/swav/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 2d44d7c5..def91e8e 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -108,12 +108,12 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co ## Models ### ViT-Base -| Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links | +| Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links | | ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ | -| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model]() \| [log]() | -| resnet50 | linear probe | ImageNet2012 | 
[config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 75.3 | 0.7662 | [model]() \| [log]() | -| resnet50 | finetune | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 100 | 69.0 | [model]() \| [log]() | - +| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://github.com/shiyutang/files/files/11493437/pretrain_train.log) | +| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://github.com/shiyutang/files/files/11493435/linear_train.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493438/semi10_train.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493451/semi1.log) | ## Citations ```bibtex From c8175d31e225e83baaffd8b82260ee9d0948dc3a Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 10:15:53 +0800 Subject: [PATCH 33/46] CI --- tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 6 +++--- tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh | 6 +++--- .../CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index f56f1e0b..734e5108 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -22,9 +22,9 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml \ - -o Global.print_batch_step=1 \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp32.yaml \ + -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh index fd8a7709..d018a166 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh @@ -22,9 +22,9 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml \ - -o Global.print_batch_step=1 \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml \ 
+ -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 \ - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh index 2fa7ad20..8d8cd867 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh @@ -23,7 +23,7 @@ python -m paddle.distributed.launch \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ -c ../../tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml \ - -o Global.print_batch_step=1 \ + -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 From 510ca3d5a8f0c6d9a76cc28153410e0be8670963 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 10:18:53 +0800 Subject: [PATCH 34/46] CI --- tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index 734e5108..59cfe413 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -26,5 +26,5 @@ python -m paddle.distributed.launch \ -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ - -o Global.flags.FLAGS_cudnn_deterministic=1 + -o Global.flags.FLAGS_cudnn_deterministic=1 \ -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained From 455bf84d0be2652d41422d0948ceafe30ca7ca12 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 14:13:43 +0800 Subject: [PATCH 35/46] CI --- tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index 59cfe413..220262ac 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -22,7 +22,7 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp32.yaml \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \ -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ From 9473b92348d4c55d8de77ac2a55aef22cf9426e7 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 17:00:24 +0800 Subject: [PATCH 36/46] fix_CI --- .../swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 ++-- .../swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 4 ++-- .../configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 2 +- .../configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml 
b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index e8715e9e..32fa7a90 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -64,7 +64,7 @@ DataLoader: Train: dataset: name: FewShotDataset - root: data/ILSVRC2012/train + root: ./dataset/ILSVRC2012/train transform: - RandomResizedCrop: size: 224 @@ -86,7 +86,7 @@ DataLoader: Eval: dataset: name: ImageFolder - root: data/ILSVRC2012/val + root: ./dataset/ILSVRC2012/val transform: - Resize: size: 256 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 05c9b334..59c44277 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -57,7 +57,7 @@ DataLoader: Train: dataset: name: ImageFolder - root: data/ILSVRC2012/train + root: ./dataset/ILSVRC2012/train transform: - RandomResizedCrop: size: 224 @@ -78,7 +78,7 @@ DataLoader: Eval: dataset: name: ImageFolder - root: data/ILSVRC2012/val + root: ./dataset/ILSVRC2012/val transform: - Resize: size: 256 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 75961e5a..f2226887 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -63,7 +63,7 @@ DataLoader: Train: dataset: name: MultiCropDataset - root: ./data/ILSVRC2012 + root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index cdc0ebf8..96042f86 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -62,7 +62,7 @@ DataLoader: Train: dataset: name: MultiCropDataset - root: ./data/ILSVRC2012 + root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] From 42016993b8f7528220c7fed1a3d83d6fad3cda64 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 20:07:58 +0800 Subject: [PATCH 37/46] update_10per --- .../swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 32fa7a90..283d4e8f 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -73,7 +73,7 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - txt_file_name: 10percent.txt + txt_file_name: ./dataset/ILSVRC2012/10percent.txt sampler: name: DistributedBatchSampler batch_size: 128 # accum_steps: 1, total batchsize: 256 From 0d4292ab99a3b35107c22e94e94280f0470f4e0d Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 21:32:47 +0800 Subject: [PATCH 38/46] fix_ci --- tasks/ssl/swav/README.md | 123 ++++++++++++++++++ tests/CI/case.sh | 38 ++++-- .../swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 2 +- 3 files changed, 148 insertions(+), 15 deletions(-) 
diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index def91e8e..0950a663 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -126,3 +126,126 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co primaryClass={cs.CV} } ``` + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env bash +set -e + +export passl_path=/paddle/PASSL/tests/CI +export log_path=/paddle/log_passl + +function model_list(){ + swav_resnet50_224_ft_in1k_1n4c_dp_fp32 + swav_resnet50_224_lp_in1k_1n8c_dp_fp32 + swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 +} + +############ case start ############ + +function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '120/126' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=2.01301 + ips_base=1922.62626 + mem_base=10.50 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + +function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=4.89133 + ips_base=11111.52955 + mem_base=0.83 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + + +function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh + + loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=8.00343 + ips_base=1385.94186 + mem_base=8.63 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + +function check_result() { + if [ $? 
-ne 0 ];then + echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log + exit -1 + fi + + if [ $# -ne 7 ]; then + echo -e "\033 parameter transfer failed: $@ \033" | tee -a $log_path/result.log + exit -1 + fi + + echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log + if [ $2 != $3 ];then + echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + + diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') + echo -e "ips_base: $4 ips_test: $5 ips_diff: $diff% " | tee -a $log_path/result.log + # 设置不同ips校验阈值 + if [ $1 == mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1 ];then + v1=$(echo $diff 10.0|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -10.0|awk '{print($1<=$2)?"0":"1"}') + else + v1=$(echo $diff 5.0|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -5.0|awk '{print($1<=$2)?"0":"1"}') + fi + if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then + echo -e "\033 $1 ips diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + + echo -e "mem_base: $6 mem_test: $7" | tee -a $log_path/result.log + if [ $6 != $7 ];then + echo -e "\033 $1 mem diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + +} + + +main() { + cd ${passl_path} + + model_list +} + +main$@ diff --git a/tests/CI/case.sh b/tests/CI/case.sh index d123f64e..caeca3cb 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -395,12 +395,12 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { rm -rf log bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh - loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + loss=`cat log/workerlog.0 | grep '120/126' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=1.97248 - ips_base=793.89847 - mem_base=10.74 + mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=2.01301 + ips_base=1536.33 + mem_base=10.50 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } @@ -413,9 +413,9 @@ function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=4.89133 - ips_base=11111.52955 - mem_base=0.83 + loss_base=3.83529 + ips_base=5620.26 + mem_base=0.46 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } @@ -429,9 +429,9 @@ function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=8.00343 - ips_base=1385.94186 - mem_base=8.63 + 
loss_base=7.94478 + ips_base=982.07 + mem_base=8.62 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } @@ -448,9 +448,19 @@ function check_result() { fi echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log - if [ $2 != $3 ];then - echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log - exit -1 + diff=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') + if [ $1 == swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 ];then + v1=$(echo $diff 0.1|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -0.1|awk '{print($1<=$2)?"0":"1"}') + if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then + echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + else + if [ $2 != $3 ];then + echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi fi diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index 220262ac..badf168a 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -24,7 +24,7 @@ python -m paddle.distributed.launch \ passl-train \ -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \ -o Global.print_batch_step=20 \ - -o Global.max_train_step=201 \ + -o Global.max_train_step=121 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 \ -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained From 3b0862c6af6b885251e242f958cf18de12ea9b6d Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 09:58:33 +0800 Subject: [PATCH 39/46] ft_ips --- tests/CI/case.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index caeca3cb..f6e34761 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -399,7 +399,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` loss_base=2.01301 - ips_base=1536.33 + ips_base=1919.8 mem_base=10.50 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" From afba8ea57727856e04ff86686882fbfc041312fc Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 11:03:35 +0800 Subject: [PATCH 40/46] fix_by_comment --- passl/data/dataset/__init__.py | 2 +- passl/data/dataset/fewshot_dataset.py | 4 +- ...p_dataset.py => swavmulticrop_datatset.py} | 0 passl/engine/engine.py | 1 + passl/engine/loops/loop.py | 1 - passl/models/swav.py | 6 +- tasks/ssl/swav/README.md | 145 ++---------------- ...av_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml} | 0 ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 2 +- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 2 +- 10 files changed, 19 insertions(+), 144 deletions(-) rename passl/data/dataset/{multicrop_dataset.py => swavmulticrop_datatset.py} (100%) rename tasks/ssl/swav/configs/{swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml} (100%) diff --git a/passl/data/dataset/__init__.py 
b/passl/data/dataset/__init__.py index 011cae11..fb5933f5 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -63,5 +63,5 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder -from .multicrop_dataset import MultiCropDataset +from .multicrop_dataset import SwAVMultiCropDataset from .fewshot_dataset import FewShotDataset diff --git a/passl/data/dataset/fewshot_dataset.py b/passl/data/dataset/fewshot_dataset.py index c696f549..47448308 100644 --- a/passl/data/dataset/fewshot_dataset.py +++ b/passl/data/dataset/fewshot_dataset.py @@ -54,5 +54,5 @@ def __init__(self, self.imgs = [(os.path.join(root, li.split('_')[0], li), self.class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise FileNotFoundError('{} is not existed'.format(txt_file_name)) - print('Previous information is not correct.') - print(f'Actually, we have total {len(self.imgs)} images in semi-training setting.') + logger.info('Previous information is not correct.') + logger.info(f'Actually, we have total {len(self.imgs)} images in semi-training setting.') diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/swavmulticrop_datatset.py similarity index 100% rename from passl/data/dataset/multicrop_dataset.py rename to passl/data/dataset/swavmulticrop_datatset.py diff --git a/passl/engine/engine.py b/passl/engine/engine.py index d0008aa7..5302efff 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -345,6 +345,7 @@ def checkpoint(self): def init_runtime_info_hub(self): runtime_info_hub.epochs = self.train_loop.epochs runtime_info_hub.max_steps = self.train_loop.max_steps + runtime_info_hub.total_iterations = self.train_loop.global_step def train(self): assert self.mode == "train" diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index 80b643df..45745484 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -265,7 +265,6 @@ def train_one_epoch(self): for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - runtime_info_hub.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( diff --git a/passl/models/swav.py b/passl/models/swav.py index 08ea2879..db4eeeb8 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -89,7 +89,7 @@ def _freeze_norm(self, layer): class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) - self.linear = RegLog(class_num) + self.linear = RegLogit(class_num) self.res_model.eval() # freeze all layers but the last fc @@ -226,11 +226,11 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo return model -class RegLog(paddle.nn.Layer): +class RegLogit(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" def __init__(self, num_labels): - super(RegLog, self).__init__() + super(RegLogit, self).__init__() s = 2048 self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 0950a663..e573b84c 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -70,7 +70,7 @@ To perform end-to-end fine-tuning for SwAV: * First download the data split text file with following commands: ```bash - cd PASSL + cd PASSL/dataset/ILSVRC2012 wget 
"https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/10percent.txt" @@ -78,12 +78,10 @@ To perform end-to-end fine-tuning for SwAV: ``` * Then, download the pretrained models to `./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams` - -- Download pretrained model -```bash -mkdir -p pretrained/swav -wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams -``` + ```bash + mkdir -p pretrained/swav + wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams + ``` * Finally, run the training with the trained PASSL format checkpoint: ```bash @@ -107,13 +105,13 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co ## Models -### ViT-Base +### Resnet | Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links | | ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ | -| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://github.com/shiyutang/files/files/11493437/pretrain_train.log) | -| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://github.com/shiyutang/files/files/11493435/linear_train.log) | -| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493438/semi10_train.log) | -| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493451/semi1.log) | +| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N2C16 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.log) | +| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | 
[config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.log) | ## Citations ```bibtex @@ -126,126 +124,3 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co primaryClass={cs.CV} } ``` - -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash -set -e - -export passl_path=/paddle/PASSL/tests/CI -export log_path=/paddle/log_passl - -function model_list(){ - swav_resnet50_224_ft_in1k_1n4c_dp_fp32 - swav_resnet50_224_lp_in1k_1n8c_dp_fp32 - swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 -} - -############ case start ############ - -function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { - echo "=========== $FUNCNAME run begin ===========" - rm -rf log - bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh - - loss=`cat log/workerlog.0 | grep '120/126' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` - ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=2.01301 - ips_base=1922.62626 - mem_base=10.50 - check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} - echo "=========== $FUNCNAME run end ===========" -} - -function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { - echo "=========== $FUNCNAME run begin ===========" - rm -rf log - bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh - - loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` - ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=4.89133 - ips_base=11111.52955 - mem_base=0.83 - check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} - echo "=========== $FUNCNAME run end ===========" -} - - -function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { - echo "=========== $FUNCNAME run begin ===========" - rm -rf log - bash ./ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh - - loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` - ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=8.00343 - ips_base=1385.94186 - mem_base=8.63 - check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} 
${mem_base} ${mem} - echo "=========== $FUNCNAME run end ===========" -} - -function check_result() { - if [ $? -ne 0 ];then - echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log - exit -1 - fi - - if [ $# -ne 7 ]; then - echo -e "\033 parameter transfer failed: $@ \033" | tee -a $log_path/result.log - exit -1 - fi - - echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log - if [ $2 != $3 ];then - echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi - - diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') - echo -e "ips_base: $4 ips_test: $5 ips_diff: $diff% " | tee -a $log_path/result.log - # 设置不同ips校验阈值 - if [ $1 == mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1 ];then - v1=$(echo $diff 10.0|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -10.0|awk '{print($1<=$2)?"0":"1"}') - else - v1=$(echo $diff 5.0|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -5.0|awk '{print($1<=$2)?"0":"1"}') - fi - if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then - echo -e "\033 $1 ips diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi - - echo -e "mem_base: $6 mem_test: $7" | tee -a $log_path/result.log - if [ $6 != $7 ];then - echo -e "\033 $1 mem diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi - -} - - -main() { - cd ${passl_path} - - model_list -} - -main$@ diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml similarity index 100% rename from tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml rename to tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index f2226887..f3f3f3ab 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -62,7 +62,7 @@ Optimizer: DataLoader: Train: dataset: - name: MultiCropDataset + name: SwAVMultiCropDataset root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 96042f86..bf59988b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -61,7 +61,7 @@ Optimizer: DataLoader: Train: dataset: - name: MultiCropDataset + name: SwAVMultiCropDataset root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] From c975a33a0dc45f527a4dec676bd9e440a1fa10fd Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 14:21:58 +0800 Subject: [PATCH 41/46] update --- passl/data/dataset/__init__.py | 2 +- passl/data/dataset/swavmulticrop_datatset.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/passl/data/dataset/__init__.py b/passl/data/dataset/__init__.py index fb5933f5..2f75f2b6 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -63,5 +63,5 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder -from .multicrop_dataset import SwAVMultiCropDataset +from .swavmulticrop_datatset import SwAVMultiCropDataset from .fewshot_dataset import FewShotDataset diff --git 
a/passl/data/dataset/swavmulticrop_datatset.py b/passl/data/dataset/swavmulticrop_datatset.py index f3acce64..ce7826ba 100644 --- a/passl/data/dataset/swavmulticrop_datatset.py +++ b/passl/data/dataset/swavmulticrop_datatset.py @@ -29,14 +29,14 @@ ) -class MultiCropDataset(ImageFolder): +class SwAVMultiCropDataset(ImageFolder): def __init__(self, root, size_crops, num_crops, min_scale_crops, max_scale_crops): - super(MultiCropDataset, self).__init__(root) + super(SwAVMultiCropDataset, self).__init__(root) assert len(size_crops) == len(num_crops) assert len(min_scale_crops) == len(num_crops) From 0016737988f85d870e3597ab57807623dd6e6454 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 15:34:00 +0800 Subject: [PATCH 42/46] update --- tests/CI/case.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index f6e34761..c79647ab 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -450,8 +450,8 @@ function check_result() { echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log diff=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') if [ $1 == swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 ];then - v1=$(echo $diff 0.1|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -0.1|awk '{print($1<=$2)?"0":"1"}') + v1=$(echo $diff 0.2|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -0.2|awk '{print($1<=$2)?"0":"1"}') if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log exit -1 From 363b4a4ac70878fc5623a2097a0980af39f18933 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 19 May 2023 10:29:56 +0800 Subject: [PATCH 43/46] pretrain_fix --- passl/models/swav.py | 50 +++++++++++++++++++++----------------------- tests/CI/case.sh | 13 ++---------- 2 files changed, 26 insertions(+), 37 deletions(-) diff --git a/passl/models/swav.py b/passl/models/swav.py index db4eeeb8..3ba2dda9 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -198,33 +198,7 @@ def after_loss_backward(self, iteration): if 'prototypes' in name and p.grad is not None: p.clear_grad() -def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(**kwargs) - return model - -def swav_resnet50_finetune(**kwargs): - model = SwAVFinetune(**kwargs) - if paddle.distributed.get_world_size() > 1: - model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - return model - -def swav_resnet50_pretrain(apex, **kwargs): # todo - flags = {} - flags['FLAGS_cudnn_exhaustive_search'] = True - flags['FLAGS_cudnn_deterministic'] = False - paddle.set_flags(flags) - - model = SwAVPretrain(**kwargs) - if paddle.distributed.get_world_size() > 1: - if not apex: - model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - else: - # with apex syncbn speeds up computation than global syncbn - process_group = apex.parallel.create_syncbn_process_group(8) - model = apex.parallel.convert_syncbn_model(model, process_group=process_group) - - return model class RegLogit(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" @@ -348,3 +322,27 @@ def forward(self, x): def swavresnet50(**kwargs): return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) + + +def swav_resnet50_linearprobe(**kwargs): + model = SwAVLinearProbe(**kwargs) + return model + +def swav_resnet50_finetune(**kwargs): + model = SwAVFinetune(**kwargs) + if paddle.distributed.get_world_size() > 1: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + return model + +def 
swav_resnet50_pretrain(apex, **kwargs): + model = SwAVPretrain(**kwargs) + + if paddle.distributed.get_world_size() > 1: + if not apex: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + else: + # with apex syncbn speeds up computation than global syncbn + process_group = apex.parallel.create_syncbn_process_group(8) + model = apex.parallel.convert_syncbn_model(model, process_group=process_group) + + return model diff --git a/tests/CI/case.sh b/tests/CI/case.sh index c79647ab..7ee30014 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -448,21 +448,12 @@ function check_result() { fi echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log - diff=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') - if [ $1 == swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 ];then - v1=$(echo $diff 0.2|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -0.2|awk '{print($1<=$2)?"0":"1"}') - if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then + if [ $2 != $3 ];then echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log exit -1 - fi - else - if [ $2 != $3 ];then - echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi fi + diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') echo -e "ips_base: $4 ips_test: $5 ips_diff: $diff% " | tee -a $log_path/result.log # 设置不同ips校验阈值 From a55609994fb722f5ac41805c4d3720359215751f Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 19 May 2023 11:55:44 +0800 Subject: [PATCH 44/46] update --- tests/CI/case.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 7ee30014..5a05ac78 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -429,7 +429,7 @@ function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=7.94478 + loss_base=7.93896 ips_base=982.07 mem_base=8.62 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} From 790500f21c5fd566f4e62d53ea27b48d10c668c8 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 19 May 2023 14:02:14 +0800 Subject: [PATCH 45/46] update --- tests/CI/case.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 5a05ac78..3d10679a 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -430,8 +430,8 @@ function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` loss_base=7.93896 - ips_base=982.07 - mem_base=8.62 + ips_base=1000.3 + mem_base=8.37 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } From b929bf26e2fcc6ca5ea15efae70771d75df397cc Mon Sep 17 00:00:00 2001 From: tangshiyu Date: Mon, 29 May 2023 20:24:33 +0800 Subject: [PATCH 46/46] update_doc --- tasks/ssl/swav/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tasks/ssl/swav/README.md 
b/tasks/ssl/swav/README.md
index e573b84c..49dd6719 100644
--- a/tasks/ssl/swav/README.md
+++ b/tasks/ssl/swav/README.md
@@ -62,7 +62,7 @@ python -m paddle.distributed.launch \
     --master=$PADDLE_MASTER \
     --devices=$CUDA_VISIBLE_DEVICES \
     passl-train \
-    -c ./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml
+    -c ./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml
 ```
 
 ## How to End-to-End Fine-tuning
@@ -96,8 +96,8 @@ To perform end-to-end fine-tuning for SwAV:
     --master=$PADDLE_MASTER \
     --devices=$CUDA_VISIBLE_DEVICES \
     passl-train \
-    -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml
-    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained
+    -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \
+    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained
 ```
 
 ## Other Configurations
@@ -109,9 +109,9 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co
 | Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links |
 | ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ |
 | resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N2C16 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.log) |
-| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.log) |
-| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.log) |
-| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.log) |
+| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.log) |
+| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.log) |
+| resnet50 | finetune-semi1 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.log) |
 
 ## Citations
 ```bibtex