From 143e493f650aebb2fa4f9d0d30a3efb201f3ae5b Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 18 Apr 2023 17:14:58 +0800 Subject: [PATCH 01/46] transfer_swav --- passl/models/resnet.py | 265 ++++++++++++++++++ passl/models/swav.py | 135 +++++++++ tasks/ssl/swav/README.md | 108 +++++++ ...se_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml | 148 ++++++++++ ...se_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml | 109 +++++++ ...e_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml | 108 +++++++ ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 110 ++++++++ tasks/ssl/swav/finetune.sh | 28 ++ tasks/ssl/swav/linearprobe.sh | 26 ++ tasks/ssl/swav/pretrain.sh | 26 ++ 10 files changed, 1063 insertions(+) create mode 100644 passl/models/resnet.py create mode 100644 passl/models/swav.py create mode 100644 tasks/ssl/swav/README.md create mode 100644 tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml create mode 100644 tasks/ssl/swav/finetune.sh create mode 100644 tasks/ssl/swav/linearprobe.sh create mode 100644 tasks/ssl/swav/pretrain.sh diff --git a/passl/models/resnet.py b/passl/models/resnet.py new file mode 100644 index 00000000..c76709cc --- /dev/null +++ b/passl/models/resnet.py @@ -0,0 +1,265 @@ +import paddle +from passl.models.base_model import Model + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, + kernel_size=3, stride=stride, padding=dilation, groups=groups, + dilation=dilation, bias_attr=False, ) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, + kernel_size=1, stride=stride, bias_attr=False) + + +class BasicBlock(nn.Layer): + expansion = 1 + __constants__ = ['downsample'] + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups= + 1, base_width=64, dilation=1, norm_layer=None): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = paddle.nn.BatchNorm2D + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + 'Dilation > 1 not supported in BasicBlock') + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = paddle.nn.ReLU() + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + + +class Bottleneck(paddle.nn.Layer): + expansion = 4 + __constants__ = ['downsample'] + + def __init__(self, inplanes, planes, stride=1, downsample=None, groups= + 1, base_width=64, dilation=1, norm_layer=None): + super(Bottleneck, self).__init__() + if norm_layer is None: + norm_layer = paddle.nn.BatchNorm2D + width = int(planes * (base_width / 64.0)) * groups + self.conv1 = conv1x1(inplanes, width) + self.bn1 = norm_layer(width) + self.conv2 = conv3x3(width, 
width, stride, groups, dilation) + self.bn2 = norm_layer(width) + self.conv3 = conv1x1(width, planes * self.expansion) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = paddle.nn.ReLU() + self.downsample = downsample + self.stride = stride + + def forward(self, x): + identity = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + out = self.conv3(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + out = self.relu(out) + return out + +def kaiming_normal_init(param, **kwargs): + initializer = nn.initializer.KaimingNormal(**kwargs) + initializer(param, param.block) + +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + +class ResNet(paddle.nn.Layer): + def __init__(self, block, layers, zero_init_residual=False, groups=1, + widen=1, width_per_group=64, replace_stride_with_dilation=None, + norm_layer=None, normalize=False, output_dim=0, hidden_mlp=0, + nmb_prototypes=0, eval_mode=False): + + super(ResNet, self).__init__() + if norm_layer is None: + norm_layer = paddle.nn.BatchNorm2D + self._norm_layer = norm_layer + self.eval_mode = eval_mode + self.padding = paddle.nn.Pad2D(padding=1, value=0.0) + self.inplanes = width_per_group * widen + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError( + 'replace_stride_with_dilation should be None or a 3-element tuple, got {}' + .format(replace_stride_with_dilation)) + self.groups = groups + self.base_width = width_per_group + num_out_filters = width_per_group * widen + self.conv1 = paddle.nn.Conv2D(in_channels=3, out_channels= + num_out_filters, kernel_size=7, stride=2, padding=2, bias_attr= + False) + self.bn1 = norm_layer(num_out_filters) + self.relu = paddle.nn.ReLU() + self.maxpool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, num_out_filters, layers[0]) + num_out_filters *= 2 + self.layer2 = self._make_layer(block, num_out_filters, layers[1], + stride=2, dilate=replace_stride_with_dilation[0]) + num_out_filters *= 2 + self.layer3 = self._make_layer(block, num_out_filters, layers[2], + stride=2, dilate=replace_stride_with_dilation[1]) + num_out_filters *= 2 + self.layer4 = self._make_layer(block, num_out_filters, layers[3], + stride=2, dilate=replace_stride_with_dilation[2]) + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + self.l2norm = normalize + if output_dim == 0: + self.projection_head = None + elif hidden_mlp == 0: + self.projection_head = paddle.nn.Linear(in_features= + num_out_filters * block.expansion, out_features=output_dim) + else: + self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( + in_features=num_out_filters * block.expansion, out_features + =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, + momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, + bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), + paddle.nn.Linear(in_features=hidden_mlp, out_features= + output_dim)) + self.prototypes = None + if isinstance(nmb_prototypes, list): + self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) + elif nmb_prototypes > 0: + self.prototypes = paddle.nn.Linear(in_features=output_dim, + out_features=nmb_prototypes, bias_attr=False) + for sublayer in self.sublayers(): + if 
isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) # todo mode='fan_out', + elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): + param_init.constant_init(sublayer.weight, value=1.0) + param_init.constant_init(sublayer.bias, value=0.0) + + if zero_init_residual: + for sublayer in self.sublayers(): + if isinstance(m, Bottleneck): + param_init.constant_init(sublayer.bn3.weight, value=0.0) + elif isinstance(m, BasicBlock): + param_init.constant_init(sublayer.bn2.weight, value=0.0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = paddle.nn.Sequential(conv1x1(self.inplanes, planes * + block.expansion, stride), norm_layer(planes * block.expansion)) + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample, self + .groups, self.base_width, previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, groups=self.groups, + base_width=self.base_width, dilation=self.dilation, + norm_layer=norm_layer)) + return paddle.nn.Sequential(*layers) + + def forward_backbone(self, x): + x = self.padding(x) + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + if self.eval_mode: + return x + x = self.avgpool(x) + x = paddle.flatten(x=x, start_axis=1) + return x + + def forward_head(self, x): + if self.projection_head is not None: + x = self.projection_head(x) + if self.l2norm: + x = paddle.nn.functional.normalize(x=x, axis=1, p=2) + if self.prototypes is not None: + return x, self.prototypes(x) + return x + + def forward(self, inputs): + if not isinstance(inputs, list): + inputs = [inputs] + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
+ to_tensor(data=[inp.shape[-1] for inp in inputs]), + return_counts=True)[1], dim=0) + start_idx = 0 + for end_idx in idx_crops: + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx: + end_idx])) + if start_idx == 0: + output = _out + else: + output = paddle.concat(x=(output, _out)) + start_idx = end_idx + return self.forward_head(output) + + +class MultiPrototypes(paddle.nn.Layer): + + def __init__(self, output_dim, nmb_prototypes): + super(MultiPrototypes, self).__init__() + self.nmb_heads = len(nmb_prototypes) + for i, k in enumerate(nmb_prototypes): + self.add_module('prototypes' + str(i), paddle.nn.Linear( + in_features=output_dim, out_features=k, bias_attr=False)) + + def forward(self, x): + out = [] + for i in range(self.nmb_heads): + out.append(getattr(self, 'prototypes' + str(i))(x)) + return out + + +def resnet50(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + + +def resnet50w2(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], widen=2, **kwargs) + + +def resnet50w4(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], widen=4, **kwargs) + + +def resnet50w5(**kwargs): + return ResNet(Bottleneck, [3, 4, 6, 3], widen=5, **kwargs) + diff --git a/passl/models/swav.py b/passl/models/swav.py new file mode 100644 index 00000000..9c64df62 --- /dev/null +++ b/passl/models/swav.py @@ -0,0 +1,135 @@ +import paddle +import paddle.nn as nn + +from passl.models.resnet import resnet50 +from passl.models.base_model import Model + + +__all__ = [ + 'swav_resnet50', + 'swav_resnet50_linearprobe', + # 'swav_resnet50_pretrain', + 'SwAV', + 'SwAVLinearProbe', + # 'SwAVPretrain', +] + +# def model and +class SwAV(Model): + def __init__(self, **kwargs): + super().__init__() + self.res_model = resnet50(**kwargs) + + + def load_pretrained(self, path, rank=0, finetune=False): + pass +# if not os.path.exists(path + '.pdparams'): +# raise ValueError("Model pretrain path {} does not " +# "exists.".format(path)) + +# state_dict = self.state_dict() +# param_state_dict = paddle.load(path + ".pdparams") + +# # for FP16 saving pretrained weight +# for key, value in param_state_dict.items(): +# if key in param_state_dict and key in state_dict and param_state_dict[ +# key].dtype != state_dict[key].dtype: +# param_state_dict[key] = param_state_dict[key].astype( +# state_dict[key].dtype) + +# if not finetune: +# self.set_dict(param_state_dict) +# else: # load model when finetune +# for k in ['head0.weight', 'head0.bias', 'head.weight', 'head.bias']: +# if k in param_state_dict: +# logger.info(f"Removing key {k} from pretrained checkpoint") +# del param_state_dict[k] + +# self.set_dict(param_state_dict) + + def save(self, path, local_rank=0, rank=0): + paddle.save(self.state_dict(), path + ".pdparams") + + +class SwAVLinearProbe(SwAV): + def __init__(self, num_classes=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + super().__init__(**kwargs) + self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) + self.res_model.eval() + self.criterion = nn.CrossEntropyLoss() + + def load_pretrained(self, path): + # only load res_model + model = path + ".pdparams" + if os.path.isfile(path): + state_dict = paddle.load(path) + + # remove prefixe "module." 
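+ # strip any "module." prefix (left by distributed data-parallel wrappers when the checkpoint was saved) so keys match the bare res_model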
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} + for k, v in model.state_dict().items(): + if k not in list(state_dict): + logger.info('key "{}" could not be found in provided state dict'.format(k)) + elif state_dict[k].shape != v.shape: + logger.info('key "{}" is of different shape in model and provided state dict'.format(k)) + state_dict[k] = v + msg = self.res_model.set_dict(state_dict, strict=False) + logger.info("Load pretrained model with msg: {}".format(msg)) + else: + logger.info("No pretrained weights found => training with random weights") + + def forward() + with paddle.no_grad(): + output = self.res_model(inp) + output = reglog(output) + + return output + + +def swav_resnet50_linearprobe(**kwargs): + model = SwAVLinearProbe(num_classes=1000, + linear_arch="resnet50", + global_avg=True, + use_bn=False, + output_dim=0, + eval_mode=True, + **kwargs) + return model + + + +class RegLog(paddle.nn.Layer): + """Creates logistic regression on top of frozen features""" + + def __init__(self, num_labels, arch='resnet50', global_avg=False, + use_bn=True): + super(RegLog, self).__init__() + self.bn = None + if global_avg: + if arch == 'resnet50': + s = 2048 + elif arch == 'resnet50w2': + s = 4096 + elif arch == 'resnet50w4': + s = 8192 + self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + else: + assert arch == 'resnet50' + s = 8192 + self.av_pool = paddle.nn.AvgPool2D(6, stride=1) + if use_bn: + self.bn = paddle.nn.BatchNorm2D(num_features=2048, momentum + =1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr= + None, use_global_stats=True) + self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) + x = self.linear.weight.data + paddle.assign(paddle.normal(mean=0.0, std=0.01, shape=x.shape). + astype(x.dtype), x) + self.linear.bias.data.zero_() + + def forward(self, x): + x = self.av_pool(x) + if self.bn is not None: + x = self.bn(x) + + x = x.view((x.shape[0], -1)) + return self.linear(x) \ No newline at end of file diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md new file mode 100644 index 00000000..b3f14b0e --- /dev/null +++ b/tasks/ssl/swav/README.md @@ -0,0 +1,108 @@ +## MoCo v3 for Self-supervised ResNet and ViT + + +PaddlePaddle reimplementation of [facebookresearch's repository for the MoCo v3 model](https://github.com/facebookresearch/moco-v3) that was released with the paper [An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/abs/2104.02057). + +## Requirements +To enjoy some new features, PaddlePaddle 2.4 is required. For more installation tutorials +refer to [installation.md](../../../tutorials/get_started/installation.md) + +## Data Preparation + +Prepare the data into the following directory: +```text +dataset/ +└── ILSVRC2012 + ├── train + └── val +``` + + +## How to Self-supervised Pre-Training + +With a batch size of 4096, ViT-Base is trained with 4 nodes: + +```bash +# Note: Set the following environment variables +# and then need to run the script on each node. 
+unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=4 +export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml +``` + +## How to Linear Classification + +By default, we use momentum-SGD and a batch size of 1024 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. + +```bash +unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml +``` + +## How to End-to-End Fine-tuning +To perform end-to-end fine-tuning for ViT, use our script to convert the pre-trained ViT checkpoint to PASSL DeiT format: + +```bash +python extract_weight.py \ + --input pretrained/checkpoint_0299.pd \ + --output pretrained/moco_vit_base.pdparams +``` + +Then run the training with the converted PASSL format checkpoint: + +```bash +unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml +``` + +## Other Configurations +We provide more directly runnable configurations, see [MoCoV3 Configurations](./configs/). 
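+
+As a quick sanity check of the SwAV-style ResNet-50 introduced in `passl/models/resnet.py`, the following minimal sketch runs a forward pass on random data. The 128-d embedding, 2048-d hidden MLP and 3000 prototypes are assumed SwAV defaults rather than values taken from the configs above:
+
+```python
+import paddle
+from passl.models.resnet import resnet50
+
+# SwAV-style backbone with projection head and prototype layer (assumed sizes)
+model = resnet50(normalize=True, output_dim=128, hidden_mlp=2048, nmb_prototypes=3000)
+model.eval()
+
+x = paddle.randn([2, 3, 224, 224])
+with paddle.no_grad():
+    feats = model.forward_backbone(x)        # pooled features, shape [2, 2048]
+    emb, scores = model.forward_head(feats)  # l2-normalized embeddings [2, 128], prototype scores [2, 3000]
+print(emb.shape, scores.shape)
+```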
+ +## Models + +### ViT-Base +| Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc | Checkpoint | +| ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ | +| moco_vit_base | pretrain | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 300 | - | [download](https://plsc.bj.bcebos.com/models/mocov3/v2.4/moco_vit_base_in1k_300ep.pd) | +| moco_vit_base | linear prob | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 90 | 0.7662 | | +| moco_vit_base | finetune | ImageNet2012 | [config](./configs/DeiT_base_patch16_224_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 150 | 0.8288 | | + +## Citations + +```bibtex +@Article{chen2021mocov3, + author = {Xinlei Chen* and Saining Xie* and Kaiming He}, + title = {An Empirical Study of Training Self-Supervised Vision Transformers}, + journal = {arXiv preprint arXiv:2104.02057}, + year = {2021}, +} +``` diff --git a/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..d70c6647 --- /dev/null +++ b/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained + finetune: True + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + seed: 2022 + +# FP16 setting +FP16: + level: O1 + GradScaler: + init_loss_scaling: 65536.0 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: DeiT_base_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: TimmCosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + decay_unit: epoch + +Optimizer: + name: AdamW + betas: (0.9, 0.999) + eps: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: ["cls_token", "pos_embed", "norm", "bias"] + use_master_param: True + exp_avg_force_fp32: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + interpolation: bicubic + - RandomHorizontalFlip: + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + mean: [0.485, 0.456, 0.406] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + - ToCHWImage: + batch_transform: + - TransformOpSampler: + Mixup: + alpha: 0.8 + prob: 0.5 + epsilon: 0.1 + class_num: 1000 + Cutmix: + alpha: 1.0 + prob: 0.5 + epsilon: 0.1 + class_num: 1000 + sampler: + name: RepeatedAugSampler + batch_size: 128 # accum_steps: 1, total batchsize: 1024 + drop_last: 
False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/val + transform: + - Resize: + size: 256 + interpolation: bicubic + backend: pil + - CenterCrop: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..ae0efc7b --- /dev/null +++ b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,109 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + seed: 2022 + +# FP16 setting +FP16: + level: O1 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: mocov3_vit_base_linearprobe + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: TimmCosine + learning_rate: 12.0 + decay_unit: epoch + last_epoch: 0 + warmup_epoch: 0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 0.0 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: data/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 # accum_steps: 1, total batchsize: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: data/ILSVRC2012/val + transform: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml new file mode 100644 index 00000000..cb3a7a9e --- /dev/null +++ b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -0,0 +1,108 @@ +# global configs +Global: + task_type: ContrastiveLearning + train_loop: ContrastiveLearningTrainingEpochLoop + validate_loop: None + checkpoint: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + 
eval_during_train: False + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + seed: 2023 + +# FP16 setting +FP16: + level: O1 + GradScaler: + init_loss_scaling: 65536.0 + incr_every_n_steps: 2000 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: mocov3_vit_base_pretrain + +LRScheduler: + name: TimmCosine + learning_rate: 0.0024 + eta_min: 0.0 + warmup_epoch: 40 + warmup_start_lr: 0.0 + decay_unit: step + warmup_prefix: True + +Optimizer: + name: AdamW + betas: (0.9, 0.999) + eps: 1e-8 + weight_decay: 0.1 + use_master_param: True + exp_avg_force_fp32: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/train + transform: + - TwoViewsTransform: + base_transform1: + - RandomResizedCrop: + size: 224 + scale: [0.08, 1.0] + interpolation: bicubic + - ColorJitter: + p: 0.8 + brightness: 0.4 + contrast: 0.4 + saturation: 0.2 + hue: 0.1 + - RandomGrayscale: + p: 0.2 + - SimCLRGaussianBlur: + p: 1.0 + sigma: [.1, 2.] + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + base_transform2: + - RandomResizedCrop: + size: 224 + scale: [0.08, 1.0] + interpolation: bicubic + - ColorJitter: + p: 0.8 + brightness: 0.4 + contrast: 0.4 + saturation: 0.2 + hue: 0.1 + - RandomGrayscale: + p: 0.2 + - BYOLSolarize: + p: 0.2 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 # accum_steps: 1, total batchsize: 4096 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml new file mode 100644 index 00000000..3f8782ca --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -0,0 +1,110 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: swav_800ep_pretrain.pdparams + output_dir: ./output/ + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + seed: 31 + +# FP16 setting ignore in align +# FP16: +# level: O1 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_linearprobe + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: TimmCosine + learning_rate: 0.3 + decay_unit: epoch + last_epoch: 0 + warmup_epoch: 0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 # accum_steps: 1, total batchsize: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: ./dataset/ILSVRC2012/val 
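+ # standard ImageNet evaluation preprocessing: resize to 256, center-crop to 224, then normalize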
+ transform: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh new file mode 100644 index 00000000..cae7ebba --- /dev/null +++ b/tasks/ssl/swav/finetune.sh @@ -0,0 +1,28 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Note: Set the following environment variables +# and then need to run the script on each node. +unset PADDLE_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh new file mode 100644 index 00000000..a0a26b4e --- /dev/null +++ b/tasks/ssl/swav/linearprobe.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#unset PADDLE_TRAINER_ENDPOINTS +#export PADDLE_NNODES=1 +#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh new file mode 100644 index 00000000..f5dfc176 --- /dev/null +++ b/tasks/ssl/swav/pretrain.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#unset PADDLE_TRAINER_ENDPOINTS +#export PADDLE_NNODES=4 +#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml From 31921ca0be98649ee28a6a124d4ff8344621d8f6 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 18 Apr 2023 20:58:16 +0800 Subject: [PATCH 02/46] valid_train --- passl/models/__init__.py | 1 + passl/models/resnet.py | 9 ++- passl/models/swav.py | 69 +++++++++++-------- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 12 ++-- tasks/ssl/swav/linearprobe.sh | 18 +++-- tools/train.py | 0 6 files changed, 67 insertions(+), 42 deletions(-) mode change 100644 => 100755 tools/train.py diff --git a/passl/models/__init__.py b/passl/models/__init__.py index ad01e964..6174f44e 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -25,6 +25,7 @@ from .cae import * from .convnext import * from .mocov3 import * +from .swav import * __all__ = ["build_model"] diff --git a/passl/models/resnet.py b/passl/models/resnet.py index c76709cc..9abfc0d4 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -1,4 +1,6 @@ import paddle +import paddle.nn as nn + from passl.models.base_model import Model def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): @@ -95,6 +97,7 @@ def constant_init(param, **kwargs): initializer = nn.initializer.Constant(**kwargs) initializer(param, param.block) + class ResNet(paddle.nn.Layer): def __init__(self, block, layers, zero_init_residual=False, groups=1, widen=1, width_per_group=64, replace_stride_with_dilation=None, @@ -159,8 +162,8 @@ def __init__(self, block, layers, zero_init_residual=False, groups=1, if isinstance(sublayer, nn.Conv2D): kaiming_normal_init(sublayer.weight) # todo mode='fan_out', elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): - param_init.constant_init(sublayer.weight, value=1.0) - param_init.constant_init(sublayer.bias, value=0.0) + constant_init(sublayer.weight, value=1.0) + constant_init(sublayer.bias, value=0.0) if zero_init_residual: for sublayer in self.sublayers(): @@ -219,7 +222,7 @@ def forward(self, inputs): inputs = [inputs] idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
to_tensor(data=[inp.shape[-1] for inp in inputs]), - return_counts=True)[1], dim=0) + return_counts=True)[1], axis=0) # padiff start_idx = 0 for end_idx in idx_crops: _out = self.forward_backbone(paddle.concat(x=inputs[start_idx: diff --git a/passl/models/swav.py b/passl/models/swav.py index 9c64df62..7cba9acc 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,3 +1,5 @@ +import os + import paddle import paddle.nn as nn @@ -6,7 +8,7 @@ __all__ = [ - 'swav_resnet50', + # 'swav_resnet50', 'swav_resnet50_linearprobe', # 'swav_resnet50_pretrain', 'SwAV', @@ -52,42 +54,48 @@ def save(self, path, local_rank=0, rank=0): class SwAVLinearProbe(SwAV): - def __init__(self, num_classes=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): super().__init__(**kwargs) self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) self.res_model.eval() - self.criterion = nn.CrossEntropyLoss() - def load_pretrained(self, path): + def load_pretrained(self, path, rank=0, finetune=False): # only load res_model - model = path + ".pdparams" if os.path.isfile(path): - state_dict = paddle.load(path) - - # remove prefixe "module." - state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()} - for k, v in model.state_dict().items(): - if k not in list(state_dict): - logger.info('key "{}" could not be found in provided state dict'.format(k)) - elif state_dict[k].shape != v.shape: - logger.info('key "{}" is of different shape in model and provided state dict'.format(k)) - state_dict[k] = v - msg = self.res_model.set_dict(state_dict, strict=False) - logger.info("Load pretrained model with msg: {}".format(msg)) + para_state_dict = paddle.load(path) + + # resnet + model_state_dict = self.res_model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in para_state_dict: + print("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list(model_state_dict[k] + .shape): + print( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, model_state_dict[k] + .shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + self.res_model.set_dict(model_state_dict) + print("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), "backbone")) else: - logger.info("No pretrained weights found => training with random weights") + print("No pretrained weights found => training with random weights") - def forward() + def forward(self, inp): with paddle.no_grad(): output = self.res_model(inp) - output = reglog(output) + output = self.linear(output) return output def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(num_classes=1000, - linear_arch="resnet50", + model = SwAVLinearProbe(linear_arch="resnet50", global_avg=True, use_bn=False, output_dim=0, @@ -96,7 +104,14 @@ def swav_resnet50_linearprobe(**kwargs): return model +def normal_init(param, **kwargs): + initializer = nn.initializer.Normal(**kwargs) + initializer(param, param.block) +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" @@ -120,16 +135,16 @@ def __init__(self, num_labels, arch='resnet50', global_avg=False, self.bn = 
paddle.nn.BatchNorm2D(num_features=2048, momentum =1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr= None, use_global_stats=True) + self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) - x = self.linear.weight.data - paddle.assign(paddle.normal(mean=0.0, std=0.01, shape=x.shape). - astype(x.dtype), x) - self.linear.bias.data.zero_() + normal_init(self.linear.weight, mean=0.0, std=0.01) + constant_init(self.linear.bias, value=0.0) # padiff + def forward(self, x): x = self.av_pool(x) if self.bn is not None: x = self.bn(x) - x = x.view((x.shape[0], -1)) + x = x.reshape((x.shape[0], -1)) return self.linear(x) \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 3f8782ca..569f2e86 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,8 +4,8 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: swav_800ep_pretrain.pdparams - output_dir: ./output/ + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + output_dir: ./output/baseline device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -14,7 +14,7 @@ Global: eval_unit: "epoch" accum_steps: 1 epochs: 100 - print_batch_step: 10 + print_batch_step: 100 use_visualdl: False seed: 31 @@ -41,7 +41,7 @@ Loss: LRScheduler: name: TimmCosine - learning_rate: 0.3 + learning_rate: 0.6 decay_unit: epoch last_epoch: 0 warmup_epoch: 0 @@ -57,7 +57,7 @@ DataLoader: Train: dataset: name: ImageFolder - root: ./dataset/ILSVRC2012/train + root: data/ILSVRC2012/train transform: - RandomResizedCrop: size: 224 @@ -78,7 +78,7 @@ DataLoader: Eval: dataset: name: ImageFolder - root: ./dataset/ILSVRC2012/val + root: data/ILSVRC2012/val transform: - Resize: size: 256 diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index a0a26b4e..31511a45 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -13,14 +13,20 @@ # limitations under the License. 
#unset PADDLE_TRAINER_ENDPOINTS -#export PADDLE_NNODES=1 -#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" -#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export FLAGS_stop_check_timeout=3600 +# export PADDLE_NNODES=1 +# #export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" +# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +# export FLAGS_stop_check_timeout=3600 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml + +# python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c \ No newline at end of file diff --git a/tools/train.py b/tools/train.py old mode 100644 new mode 100755 From cc7f630b56899752cecb9ccdfbc49d09d6a147e1 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 20 Apr 2023 17:52:34 +0800 Subject: [PATCH 03/46] freeze_align --- passl/core/param_fuse.py | 31 ++-- passl/engine/loops/classification_loop.py | 108 +++++++++++- passl/models/resnet.py | 4 +- passl/models/swav.py | 62 +++++-- passl/optimizer/__init__.py | 156 +++++++++++++++--- passl/optimizer/momentum.py | 2 +- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 7 +- 7 files changed, 305 insertions(+), 65 deletions(-) diff --git a/passl/core/param_fuse.py b/passl/core/param_fuse.py index e98cca62..87fc5cb3 100644 --- a/passl/core/param_fuse.py +++ b/passl/core/param_fuse.py @@ -459,18 +459,6 @@ def flatten_dense_tensors(parameters): param_storage.add_rank_params(parameters, _param2align) - # process gradient - # grad_storage = None - grad_storage = GradStorage( - size=_buffer_size, - dtype=dtype, - device="gpu", - destination="0", - parm2align=_param2align) - - for param in parameters: - grad_storage.add_grad(param, _param2align[param.name]) - if in_dygraph_mode(): fused_param = EagerParamBase( shape=param_storage.buffer.shape, @@ -482,7 +470,22 @@ def flatten_dense_tensors(parameters): dtype=dtype, name=unique_name.generate('fused_param')) param_storage.buffer._share_buffer_to(fused_param) - fused_param._copy_gradient_from(grad_storage.buffer) + + if not stop_gradient: + # process gradient + # grad_storage = None + grad_storage = GradStorage( + size=_buffer_size, + dtype=dtype, + device="gpu", + destination="0", + parm2align=_param2align) + + for param in parameters: + grad_storage.add_grad(param, _param2align[param.name]) + + fused_param._copy_gradient_from(grad_storage.buffer) + fused_param.__dict__.update(state) fused_param.stop_gradient = stop_gradient @@ -501,4 +504,4 @@ def get_fused_params(params): for group_idx, parameters in var_groups.items(): fused_param = flatten_dense_tensors(parameters) fused_params.append(fused_param) - return fused_params + return fused_params \ No newline at end of file diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index edf9be7c..cf61ad78 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -31,6 +31,91 @@ from passl.utils import logger from .loop import _Loop, TrainingEpochLoop + +import os +import logging +import time +from datetime import timedelta +import pandas as pd + + +class LogFormatter: + def __init__(self): + self.start_time = 
time.time() + + def format(self, record): + elapsed_seconds = round(record.created - self.start_time) + + prefix = "%s - %s - %s" % ( + record.levelname, + time.strftime("%x %X"), + timedelta(seconds=elapsed_seconds), + ) + message = record.getMessage() + message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) + return "%s - %s" % (prefix, message) if message else "" + + +def create_logger(filepath, rank): + """ + Create a logger. + Use a different log file for each process. + """ + # create log formatter + log_formatter = LogFormatter() + + # create file handler and set level to debug + if filepath is not None: + if rank > 0: + filepath = "%s-%i" % (filepath, rank) + file_handler = logging.FileHandler(filepath, "a") + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(log_formatter) + + # create console handler and set level to info + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(log_formatter) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + if filepath is not None: + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + # reset logger elapsed time + def reset_time(): + log_formatter.start_time = time.time() + + logger.reset_time = reset_time + + return logger + + +def init_logger(name): + logger = create_logger( + os.path.join("{}.log".format(name)), rank=0 + ) + logger.info("============ Initialized logger ============") + logger.info("") + return logger + + +def log_model(model, logger): + model1 = model.res_model + for name, param in model1.named_parameters(): + logger.info(name) + logger.info(param.abs().sum()) + + model2 = model.linear + for name, param in model2.named_parameters(): + logger.info(name) + logger.info(param.abs().sum()) + + class ClassificationTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): @@ -60,8 +145,13 @@ def forward_backward(self, batch): out = self.trainer.model(data) final_out.append(out) - + + # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') + loss_dict = self.trainer.train_loss_func(out, label) + + # logger1 = init_logger('first') + # log_model(self.trainer.model, logger1) for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -72,9 +162,23 @@ def forward_backward(self, batch): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() + +# grad_sync(self.trainer.optimizer.param_groups) + +# # do unscale and step if using fp16 and not found nan/inf +# # otherwise do nothing +# self.trainer.scaler.step(self.trainer.optimizer) +# # do update loss scaling if using fp16 +# # otherwise do nothing +# self.trainer.scaler.update() + + # logger2 = init_logger('second') + # log_model(self.trainer.model, logger2) + # import pdb; pdb.set_trace() + out = paddle.concat(final_out, axis=0) - return out, final_loss_dict + return out, final_loss_dict, def train_one_step(self, batch): diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 9abfc0d4..07e9362f 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -1,3 +1,5 @@ +import functools + import paddle import paddle.nn as nn @@ -106,7 +108,7 @@ def __init__(self, block, layers, 
zero_init_residual=False, groups=1, super(ResNet, self).__init__() if norm_layer is None: - norm_layer = paddle.nn.BatchNorm2D + norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=True) self._norm_layer = norm_layer self.eval_mode = eval_mode self.padding = paddle.nn.Pad2D(padding=1, value=0.0) diff --git a/passl/models/swav.py b/passl/models/swav.py index 7cba9acc..8f20b6d9 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -3,6 +3,7 @@ import paddle import paddle.nn as nn +from passl.nn import init from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -51,21 +52,36 @@ def load_pretrained(self, path, rank=0, finetune=False): def save(self, path, local_rank=0, rank=0): paddle.save(self.state_dict(), path + ".pdparams") - + class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): super().__init__(**kwargs) self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) self.res_model.eval() - - def load_pretrained(self, path, rank=0, finetune=False): - # only load res_model + + # freeze all layers but the last fc + for name, param in self.named_parameters(): + if name not in ['linear.linear.weight', 'linear.linear.bias']: + param.stop_gradient = True + + # optimize only the linear classifier + parameters = list( + filter(lambda p: not p.stop_gradient, self.parameters())) + assert len(parameters) == 2 # weight, bias + + self.apply(self._freeze_norm) + + def _freeze_norm(self, layer): + if isinstance(layer, (nn.layer.norm._BatchNormBase)): + layer._use_global_stats = True + + def _load_model(self, path, model, tag): if os.path.isfile(path): para_state_dict = paddle.load(path) # resnet - model_state_dict = self.res_model.state_dict() + model_state_dict = model.state_dict() keys = model_state_dict.keys() num_params_loaded = 0 for k in keys: @@ -80,13 +96,25 @@ def load_pretrained(self, path, rank=0, finetune=False): else: model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 - self.res_model.set_dict(model_state_dict) + model.set_dict(model_state_dict) print("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), "backbone")) + num_params_loaded, len(model_state_dict), tag)) else: - print("No pretrained weights found => training with random weights") + print("No pretrained weights found in {} => training with random weights".format(tag)) + + def load_pretrained(self, path, rank=0, finetune=False): + self._load_model(path, self.res_model, 'backbone') + self._load_model("linear.pdparams", self.linear, 'linear') + def forward(self, inp): +# import numpy as np + # import pdb; pdb.set_trace() + +# np.random.seed(42) +# a = np.random.rand(32, 3, 224, 224) +# inp = paddle.to_tensor(a).astype('float32') + with paddle.no_grad(): output = self.res_model(inp) output = self.linear(output) @@ -104,13 +132,14 @@ def swav_resnet50_linearprobe(**kwargs): return model -def normal_init(param, **kwargs): - initializer = nn.initializer.Normal(**kwargs) - initializer(param, param.block) +# def normal_init(param, **kwargs): +# initializer = nn.initializer.Normal(**kwargs) +# initializer(param, param.block) -def constant_init(param, **kwargs): - initializer = nn.initializer.Constant(**kwargs) - initializer(param, param.block) +# def constant_init(param, **kwargs): +# initializer = nn.initializer.Constant(**kwargs) +# initializer(param, param.block) + class RegLog(paddle.nn.Layer): """Creates logistic regression on top of 
frozen features""" @@ -137,9 +166,8 @@ def __init__(self, num_labels, arch='resnet50', global_avg=False, None, use_global_stats=True) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) - normal_init(self.linear.weight, mean=0.0, std=0.01) - constant_init(self.linear.bias, value=0.0) # padiff - + init.normal_(self.linear.weight, mean=0.0, std=0.01) + init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 2d87f3f3..451da1ff 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -1,3 +1,88 @@ +# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # http://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. + +# from __future__ import absolute_import +# from __future__ import division +# from __future__ import print_function + +# from collections import defaultdict + +# import copy +# import paddle + +# from passl.core.grad_clip import ClipGradByGlobalNorm +# from passl.core.param_fuse import get_fused_params + +# from passl.utils import logger + +# from .optimizer import Optimizer +# from .adamw import AdamW +# from .adafactor import Adafactor +# from .momentum import Momentum +# from .momentum_lars import MomentumLARS + + +# def build_optimizer(config, lr_scheduler, model=None): +# config = copy.deepcopy(config) + +# grad_clip = None +# grad_clip_config = config.pop('grad_clip', None) +# if grad_clip_config is not None: +# grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') +# grad_clip = eval(grad_clip_name)(**grad_clip_config) + +# no_weight_decay_name = config.pop('no_weight_decay_name', []) + +# param_group = defaultdict(list) +# for n, p in model.named_parameters(): +# state = copy.deepcopy(p.__dict__) +# if any(nd in n for nd in no_weight_decay_name): +# state['no_weight_decay'] = True +# param_group[str(state)].append(p) + +# # fuse params +# for key in param_group: +# if 'gpu' not in paddle.get_device(): +# continue +# if "'is_distributed': True" in key: +# continue +# if "'has_sparse_grad': True" in key: +# continue + +# param_group[key] = get_fused_params(param_group[key]) + +# # bulid optimizer params +# params = [] +# for key in param_group: +# group = {'params': param_group[key]} + +# if "'is_distributed': True" in key: +# group['is_distributed'] = True + +# if 'no_weight_decay' in key: +# group['weight_decay'] = 0.0 + +# params.append(group) + +# optim_name = config.pop('name') +# optim = eval(optim_name)(params, +# lr=lr_scheduler, +# grad_clip=grad_clip, +# **config) +# logger.debug("build optimizer ({}) success..".format(optim)) +# return optim + + # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -34,7 +119,8 @@ def build_optimizer(config, lr_scheduler, model=None): config = copy.deepcopy(config) - + optim_name = config.pop('name') + grad_clip = None grad_clip_config = config.pop('grad_clip', None) if grad_clip_config is not None: @@ -42,40 +128,56 @@ def build_optimizer(config, lr_scheduler, model=None): grad_clip = eval(grad_clip_name)(**grad_clip_config) no_weight_decay_name = config.pop('no_weight_decay_name', []) + tensor_fusion = config.pop('tensor_fusion', True) + if 'LAR' in optim_name: + tensor_fusion = False + logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') - param_group = defaultdict(list) - for n, p in model.named_parameters(): - state = copy.deepcopy(p.__dict__) - if any(nd in n for nd in no_weight_decay_name): - state['no_weight_decay'] = True - param_group[str(state)].append(p) - # fuse params - for key in param_group: - if 'gpu' not in paddle.get_device(): - continue - if "'is_distributed': True" in key: - continue - if "'has_sparse_grad': True" in key: - continue + if hasattr(model, 'param_groups'): + param_group = model.param_groups(no_weight_decay_name, tensor_fusion) + for group in param_group: + if 'tensor_fusion' in group and group['tensor_fusion']: + group['params'] = get_fused_params(group['params']) + else: + param_group_map = defaultdict(list) + for n, p in model.named_parameters(): + state = copy.deepcopy(p.__dict__) + state['stop_gradient'] = p.stop_gradient + if any(nd in n for nd in no_weight_decay_name): + state['no_weight_decay'] = True + param_group_map[str(state)].append(p) - param_group[key] = get_fused_params(param_group[key]) - # bulid optimizer params - params = [] - for key in param_group: - group = {'params': param_group[key]} + if tensor_fusion: + # fuse params + for key in param_group_map: + if 'gpu' not in paddle.get_device(): + continue + if "'is_distributed': True" in key: + continue + if "'has_sparse_grad': True" in key: + continue + param_group_map[key] = get_fused_params(param_group_map[key]) - if "'is_distributed': True" in key: - group['is_distributed'] = True - if 'no_weight_decay' in key: - group['weight_decay'] = 0.0 + # bulid optimizer params + param_group = [] + for key in param_group_map: + group = {'params': param_group_map[key]} - params.append(group) - optim_name = config.pop('name') - optim = eval(optim_name)(params, + if "'is_distributed': True" in key: + group['is_distributed'] = True + + + if 'no_weight_decay' in key: + group['weight_decay'] = 0.0 + + + param_group.append(group) + + optim = eval(optim_name)(param_group, lr=lr_scheduler, grad_clip=grad_clip, **config) diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 55402fd4..8b569c7c 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -72,7 +72,7 @@ def step(self): grad = p.grad if grad is None: continue - + # print('###########',p.name) if grad.is_selected_rows(): raise RuntimeError( 'Momentum does not support sparse gradients.') diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 569f2e86..a290ea19 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: 
/root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline + output_dir: ./output/baseline_0420_align_trackTrue device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -41,7 +41,8 @@ Loss: LRScheduler: name: TimmCosine - learning_rate: 0.6 + learning_rate: 0.3 + eta_min: 0.0 decay_unit: epoch last_epoch: 0 warmup_epoch: 0 @@ -50,8 +51,8 @@ Optimizer: name: Momentum momentum: 0.9 weight_decay: 1e-6 + tensor_fusion: True - # data loader for train and eval DataLoader: Train: From 4d8dc6b29b6ec2c12d92f91b2fe4ed349aa6e38c Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 21 Apr 2023 11:01:32 +0800 Subject: [PATCH 04/46] add_ft_swav --- passl/data/dataset/imagefolder_dataset.py | 20 ++- passl/models/swav.py | 107 ++++++------- passl/scheduler/lr_scheduler.py | 4 +- ...se_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml | 148 ------------------ ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 112 +++++++++++++ ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 2 + 6 files changed, 180 insertions(+), 213 deletions(-) delete mode 100644 tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 618f5d77..ef03f1c4 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union -import numpy as np import os +import urllib +import numpy as np +from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union import paddle @@ -56,11 +57,22 @@ def __init__(self, transform=None, target_transform=None, loader=default_loader, - extensions=IMG_EXTENSIONS): + extensions=IMG_EXTENSIONS, + samples_tag=None): self.root = root classes, class_to_idx = self.find_classes(self.root) - samples = self.make_dataset(self.root, class_to_idx, extensions) + if samples_tag is None: + samples = self.make_dataset(self.root, class_to_idx, extensions) + elif samples_tag == "semi_1" or samples == "semi_10": + train_data_path = os.path.join(root, "train") + percent = samples_tag.split('_')[-1] + subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] + samples = [(os.path.join(train_data_path, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] + else: + raise NotImplementedError('{} is not implemented'.format(samples)) + print(f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions diff --git a/passl/models/swav.py b/passl/models/swav.py index 8f20b6d9..e6fb79fb 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -14,6 +14,7 @@ # 'swav_resnet50_pretrain', 'SwAV', 'SwAVLinearProbe', + 'SwAVFinetune', # 'SwAVPretrain', ] @@ -23,7 +24,32 @@ def __init__(self, **kwargs): super().__init__() self.res_model = resnet50(**kwargs) - + def _load_model(self, path, model, tag): + if os.path.isfile(path): + para_state_dict = paddle.load(path) + + # resnet + model_state_dict = model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in 
para_state_dict: + print("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list(model_state_dict[k] + .shape): + print( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, model_state_dict[k] + .shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + model.set_dict(model_state_dict) + print("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), tag)) + else: + print("No pretrained weights found in {} => training with random weights".format(tag)) + def load_pretrained(self, path, rank=0, finetune=False): pass # if not os.path.exists(path + '.pdparams'): @@ -55,9 +81,9 @@ def save(self, path, local_rank=0, rank=0): class SwAVLinearProbe(SwAV): - def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) - self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) + self.linear = RegLog(class_num) self.res_model.eval() # freeze all layers but the last fc @@ -75,38 +101,11 @@ def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_ def _freeze_norm(self, layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True - - def _load_model(self, path, model, tag): - if os.path.isfile(path): - para_state_dict = paddle.load(path) - - # resnet - model_state_dict = model.state_dict() - keys = model_state_dict.keys() - num_params_loaded = 0 - for k in keys: - if k not in para_state_dict: - print("{} is not in pretrained model".format(k)) - elif list(para_state_dict[k].shape) != list(model_state_dict[k] - .shape): - print( - "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" - .format(k, para_state_dict[k].shape, model_state_dict[k] - .shape)) - else: - model_state_dict[k] = para_state_dict[k] - num_params_loaded += 1 - model.set_dict(model_state_dict) - print("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), tag)) - else: - print("No pretrained weights found in {} => training with random weights".format(tag)) def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') self._load_model("linear.pdparams", self.linear, 'linear') - def forward(self, inp): # import numpy as np # import pdb; pdb.set_trace() @@ -121,14 +120,23 @@ def forward(self, inp): return output +class SwAVFinetune(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def load_pretrained(self, path, rank=0, finetune=False): + self._load_model(path, self.res_model, 'backbone') + + def forward(self, inp): + return self.res_model(inp) + def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(linear_arch="resnet50", - global_avg=True, - use_bn=False, - output_dim=0, - eval_mode=True, - **kwargs) + model = SwAVLinearProbe(**kwargs) + return model + +def swav_resnet50_finetune(**kwargs): + model = SwAVFinetune(**kwargs) return model @@ -144,35 +152,16 @@ def swav_resnet50_linearprobe(**kwargs): class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" - def __init__(self, num_labels, arch='resnet50', global_avg=False, - use_bn=True): + def __init__(self, num_labels): super(RegLog, self).__init__() - self.bn = None - if global_avg: - if arch == 'resnet50': - s = 2048 - elif arch == 'resnet50w2': - s = 4096 - 
elif arch == 'resnet50w4': - s = 8192 - self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - else: - assert arch == 'resnet50' - s = 8192 - self.av_pool = paddle.nn.AvgPool2D(6, stride=1) - if use_bn: - self.bn = paddle.nn.BatchNorm2D(num_features=2048, momentum - =1 - 0.1, epsilon=1e-05, weight_attr=None, bias_attr= - None, use_global_stats=True) - + s = 2048 + self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) + init.normal_(self.linear.weight, mean=0.0, std=0.01) init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) - if self.bn is not None: - x = self.bn(x) - x = x.reshape((x.shape[0], -1)) return self.linear(x) \ No newline at end of file diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 223ca349..6a492920 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -123,8 +123,8 @@ class Step(lr.LRScheduler): def __init__(self, step_each_epoch, epochs, - boundaries, - values, + boundaries, # [12, 16] + values, #[0.01, 0.002, 0.0004], warmup_steps=0, warmup_epochs=0, decay_unit='epoch', diff --git a/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml deleted file mode 100644 index d70c6647..00000000 --- a/tasks/ssl/swav/configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# global configs -Global: - task_type: Classification - train_loop: ClassificationTrainingEpochLoop - validate_loop: ClassificationEvaluationLoop - checkpoint: null - pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained - finetune: True - output_dir: ./output/ - device: gpu - save_interval: 1 - max_num_latest_checkpoint: 0 - eval_during_train: True - eval_interval: 1 - eval_unit: "epoch" - accum_steps: 1 - epochs: 150 - print_batch_step: 10 - use_visualdl: False - seed: 2022 - -# FP16 setting -FP16: - level: O1 - GradScaler: - init_loss_scaling: 65536.0 - -DistributedStrategy: - data_parallel: True - -# model architecture -Model: - name: DeiT_base_patch16_224 - drop_path_rate : 0.1 - drop_rate : 0.0 - class_num: 1000 - -# loss function config for traing/eval process -Loss: - Train: - - CELoss: - weight: 1.0 - Eval: - - CELoss: - weight: 1.0 - -LRScheduler: - name: TimmCosine - learning_rate: 1e-3 - eta_min: 1e-5 - warmup_epoch: 5 - warmup_start_lr: 1e-6 - decay_unit: epoch - -Optimizer: - name: AdamW - betas: (0.9, 0.999) - eps: 1e-8 - weight_decay: 0.05 - no_weight_decay_name: ["cls_token", "pos_embed", "norm", "bias"] - use_master_param: True - exp_avg_force_fp32: True - -# data loader for train and eval -DataLoader: - Train: - dataset: - name: ImageFolder - root: ./dataset/ILSVRC2012/train - transform: - - RandomResizedCrop: - size: 224 - interpolation: bicubic - - RandomHorizontalFlip: - - TimmAutoAugment: - config_str: rand-m9-mstd0.5-inc1 - interpolation: bicubic - img_size: 224 - mean: [0.485, 0.456, 0.406] - - NormalizeImage: - scale: 1.0/255.0 - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: '' - - RandomErasing: - EPSILON: 0.25 - sl: 0.02 - sh: 1.0/3.0 - r1: 0.3 - attempt: 10 - use_log_aspect: True - mode: pixel - - ToCHWImage: - batch_transform: - - TransformOpSampler: - Mixup: - alpha: 0.8 - prob: 0.5 - epsilon: 0.1 - class_num: 1000 - Cutmix: - alpha: 1.0 - prob: 0.5 - epsilon: 0.1 - class_num: 1000 - sampler: - name: RepeatedAugSampler - 
batch_size: 128 # accum_steps: 1, total batchsize: 1024 - drop_last: False - shuffle: True - loader: - num_workers: 8 - use_shared_memory: True - - Eval: - dataset: - name: ImageFolder - root: ./dataset/ILSVRC2012/val - transform: - - Resize: - size: 256 - interpolation: bicubic - backend: pil - - CenterCrop: - size: 224 - - NormalizeImage: - scale: 1.0/255.0 - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - order: '' - - ToCHWImage: - sampler: - name: DistributedBatchSampler - batch_size: 256 - drop_last: False - shuffle: False - loader: - num_workers: 8 - use_shared_memory: True - -Metric: - Eval: - - TopkAcc: - topk: [1, 5] - -Export: - export_type: paddle - input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..f2a229a0 --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,112 @@ +# global configs +Global: + task_type: Classification + train_loop: ClassificationTrainingEpochLoop + validate_loop: ClassificationEvaluationLoop + checkpoint: null + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + finetune: True + output_dir: ./output/semi_0420 + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: True + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 20 + print_batch_step: 100 + use_visualdl: False + seed: 31 + +# FP16 setting +# FP16: +# level: O1 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_finetune + output_dim: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +LRScheduler: + name: Step + learning_rate: 0.01 + boundaries: [12, 16] + values: [0.01, 0.002, 0.0004] + decay_unit: epoch + last_epoch: 0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 0.0 + tensor_fusion: False + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageFolder + root: data/ILSVRC2012/train + transform: + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 # accum_steps: 1, total batchsize: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageFolder + root: data/ILSVRC2012/val + transform: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.228, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +Export: + export_type: paddle + input_shape: [None, 3, 224, 224] \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index a290ea19..251f5f49 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -28,6 +28,8 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_linearprobe + output_dim: 0 + 
eval_mode: True class_num: 1000 # loss function config for traing/eval process From 32b94c36f115c3c94124246936dbef151e705ba4 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 21 Apr 2023 17:07:26 +0800 Subject: [PATCH 05/46] add_pretrain --- passl/data/dataset/multicrop_dataset.py | 94 +++++++++++++++ passl/data/preprocess/basic_transforms.py | 19 +++ passl/models/swav.py | 18 ++- passl/scheduler/lr_scheduler.py | 2 +- ...se_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml | 109 ------------------ ...e_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml | 108 ----------------- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 71 ++++++++++++ tasks/ssl/swav/finetune.sh | 5 +- tasks/ssl/swav/linearprobe.sh | 4 - tasks/ssl/swav/pretrain.sh | 13 +-- 10 files changed, 207 insertions(+), 236 deletions(-) create mode 100644 passl/data/dataset/multicrop_dataset.py delete mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml delete mode 100644 tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py new file mode 100644 index 00000000..926d4a59 --- /dev/null +++ b/passl/data/dataset/multicrop_dataset.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from paddle.vision.transforms import ( + Compose, + Transpose, + ColorJitter, + RandomResizedCrop, + RandomHorizontalFlip, +) +from passl.data.dataset.imagefolder_dataset import ImageFolder +from passl.data.preprocess import ( + RandomApply, + GaussianBlur, + NormalizeImage, + RandomGrayscale, +) + + +class MultiCropDataset(ImageFolder): + def __init__(self, + dataroot, + size_crops, + num_crops, + min_scale_crops, + max_scale_crops, + return_label=False): + super(MultiCropDataset, self).__init__(dataroot) + + assert len(size_crops) == len(num_crops) + assert len(min_scale_crops) == len(num_crops) + assert len(max_scale_crops) == len(num_crops) + self.return_label = return_label + + color_transform = [get_color_distortion(), get_pil_gaussian_blur()] + mean = [0.485, 0.456, 0.406] + std = [0.229, 0.224, 0.225] + trans = [] + for i in range(len(size_crops)): + randomresizedcrop = RandomResizedCrop( + size_crops[i], + scale=(min_scale_crops[i], max_scale_crops[i]), + ) + trans.extend([Compose([ + randomresizedcrop, + RandomHorizontalFlip(prob=0.5), + Compose(color_transform), + Transpose(), + NormalizeImage(scale='1.0/255.0', mean=mean, std=std)]) + ] * num_crops[i]) + self.trans = trans + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (sample, target) where target is class_index of the target class. 
+ """ + path, target = self.samples[index] + sample = self.loader(path) + sample = list(map(lambda trans: trans(sample), self.trans)) + if self.return_label: + return sample, target + + return sample + + +def get_pil_gaussian_blur(p=0.5): + gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) + rnd_gaussian_blur = RandomApply([gaussian_blur], p=p) + return rnd_gaussian_blur + + +def get_color_distortion(s=1.0): + # s is the strength of color distortion. + color_jitter = ColorJitter(0.8*s, 0.8*s, 0.8*s, 0.2*s) + rnd_color_jitter = RandomApply([color_jitter], p=0.8) + rnd_gray = RandomGrayscale(p=0.2) + color_distort = Compose([rnd_color_jitter, rnd_gray]) + return color_distort \ No newline at end of file diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index 7be2b26a..9d9eb132 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -57,6 +57,7 @@ "SimCLRGaussianBlur", "BYOLSolarize", "MAERandCropImage", + "GaussianBlur" ] @@ -941,3 +942,21 @@ def __call__(self, img): else: img = ImageOps.solarize(img) return img + + +class GaussianBlur(object): + """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" + def __init__(self, sigma=[.1, 2.], _PIL=False): + self.sigma = sigma + self.kernel_size = 23 + self._PIL = _PIL + + def __call__(self, x): + sigma = np.random.uniform(self.sigma[0], self.sigma[1]) + if self._PIL: + x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) + return x + else: + x = cv2.GaussianBlur(np.array(x), + (self.kernel_size, self.kernel_size), sigma) + return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file diff --git a/passl/models/swav.py b/passl/models/swav.py index e6fb79fb..0181eb73 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -9,13 +9,13 @@ __all__ = [ - # 'swav_resnet50', + 'swav_resnet50_finetune', 'swav_resnet50_linearprobe', - # 'swav_resnet50_pretrain', + 'swav_resnet50_pretrain', 'SwAV', 'SwAVLinearProbe', 'SwAVFinetune', - # 'SwAVPretrain', + 'SwAVPretrain', ] # def model and @@ -130,6 +130,13 @@ def load_pretrained(self, path, rank=0, finetune=False): def forward(self, inp): return self.res_model(inp) +class SwAVPretrain(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def forward(self, inp): + return self.res_model(inp) + def swav_resnet50_linearprobe(**kwargs): model = SwAVLinearProbe(**kwargs) @@ -138,7 +145,10 @@ def swav_resnet50_linearprobe(**kwargs): def swav_resnet50_finetune(**kwargs): model = SwAVFinetune(**kwargs) return model - + +def swav_resnet50_pretrain(**kwargs): + model = SwAVPretrain(**kwargs) + return model # def normal_init(param, **kwargs): # initializer = nn.initializer.Normal(**kwargs) diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 6a492920..fb8c7c97 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -23,7 +23,7 @@ class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, - step_each_epoch, + step_each_epoch, # len(train_loader) = dataset/total_bs epochs, decay_unit='epoch', eta_min=0.0, diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml deleted file mode 100644 index ae0efc7b..00000000 --- a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml +++ /dev/null @@ -1,109 +0,0 @@ -# global configs -Global: - task_type: Classification - 
train_loop: ClassificationTrainingEpochLoop - validate_loop: ClassificationEvaluationLoop - checkpoint: null - pretrained_model: ./pretrained/mocov3/mocov3_vit_base_in1k_300ep_pretrained - output_dir: ./output/ - device: gpu - save_interval: 1 - max_num_latest_checkpoint: 0 - eval_during_train: True - eval_interval: 1 - eval_unit: "epoch" - accum_steps: 1 - epochs: 90 - print_batch_step: 10 - use_visualdl: False - seed: 2022 - -# FP16 setting -FP16: - level: O1 - -DistributedStrategy: - data_parallel: True - -# model architecture -Model: - name: mocov3_vit_base_linearprobe - class_num: 1000 - -# loss function config for traing/eval process -Loss: - Train: - - CELoss: - weight: 1.0 - Eval: - - CELoss: - weight: 1.0 - -LRScheduler: - name: TimmCosine - learning_rate: 12.0 - decay_unit: epoch - last_epoch: 0 - warmup_epoch: 0 - -Optimizer: - name: Momentum - momentum: 0.9 - weight_decay: 0.0 - -# data loader for train and eval -DataLoader: - Train: - dataset: - name: ImageFolder - root: data/ILSVRC2012/train - transform: - - RandomResizedCrop: - size: 224 - - RandomHorizontalFlip: - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - sampler: - name: DistributedBatchSampler - batch_size: 128 # accum_steps: 1, total batchsize: 1024 - drop_last: False - shuffle: True - loader: - num_workers: 8 - use_shared_memory: True - - Eval: - dataset: - name: ImageFolder - root: data/ILSVRC2012/val - transform: - - Resize: - size: 256 - - CenterCrop: - size: 224 - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - sampler: - name: DistributedBatchSampler - batch_size: 256 - drop_last: False - shuffle: False - loader: - num_workers: 8 - use_shared_memory: True - -Metric: - Train: - - TopkAcc: - topk: [1, 5] - Eval: - - TopkAcc: - topk: [1, 5] - -Export: - export_type: paddle - input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml deleted file mode 100644 index cb3a7a9e..00000000 --- a/tasks/ssl/swav/configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ /dev/null @@ -1,108 +0,0 @@ -# global configs -Global: - task_type: ContrastiveLearning - train_loop: ContrastiveLearningTrainingEpochLoop - validate_loop: None - checkpoint: null - pretrained_model: null - output_dir: ./output/ - device: gpu - save_interval: 1 - max_num_latest_checkpoint: 0 - eval_during_train: False - eval_interval: 1 - eval_unit: "epoch" - accum_steps: 1 - epochs: 300 - print_batch_step: 10 - use_visualdl: False - seed: 2023 - -# FP16 setting -FP16: - level: O1 - GradScaler: - init_loss_scaling: 65536.0 - incr_every_n_steps: 2000 - -DistributedStrategy: - data_parallel: True - -# model architecture -Model: - name: mocov3_vit_base_pretrain - -LRScheduler: - name: TimmCosine - learning_rate: 0.0024 - eta_min: 0.0 - warmup_epoch: 40 - warmup_start_lr: 0.0 - decay_unit: step - warmup_prefix: True - -Optimizer: - name: AdamW - betas: (0.9, 0.999) - eps: 1e-8 - weight_decay: 0.1 - use_master_param: True - exp_avg_force_fp32: True - -# data loader for train and eval -DataLoader: - Train: - dataset: - name: ImageFolder - root: ./dataset/ILSVRC2012/train - transform: - - TwoViewsTransform: - base_transform1: - - RandomResizedCrop: - size: 224 - scale: [0.08, 1.0] - interpolation: bicubic - - ColorJitter: - p: 0.8 - brightness: 0.4 - contrast: 0.4 - saturation: 0.2 - hue: 0.1 - - RandomGrayscale: - p: 0.2 - - 
SimCLRGaussianBlur: - p: 1.0 - sigma: [.1, 2.] - - RandomHorizontalFlip: - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - base_transform2: - - RandomResizedCrop: - size: 224 - scale: [0.08, 1.0] - interpolation: bicubic - - ColorJitter: - p: 0.8 - brightness: 0.4 - contrast: 0.4 - saturation: 0.2 - hue: 0.1 - - RandomGrayscale: - p: 0.2 - - BYOLSolarize: - p: 0.2 - - RandomHorizontalFlip: - - ToTensor: - - Normalize: - mean: [0.485, 0.456, 0.406] - std: [0.229, 0.224, 0.225] - sampler: - name: DistributedBatchSampler - batch_size: 128 # accum_steps: 1, total batchsize: 4096 - drop_last: False - shuffle: True - loader: - num_workers: 8 - use_shared_memory: True diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..749ef7c6 --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,71 @@ +# global configs +Global: + task_type: ContrastiveLearning + train_loop: ContrastiveLearningTrainingEpochLoop + validate_loop: None + checkpoint: null + pretrained_model: null + output_dir: ./output/pretrain_0420 + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: False + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 800 + print_batch_step: 100 + use_visualdl: False + seed: 31 + +# FP16 setting +# FP16: +# level: O1 +# GradScaler: +# init_loss_scaling: 65536.0 +# incr_every_n_steps: 2000 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_pretrain + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 + +LRScheduler: + name: TimmCosine + learning_rate: 4.8 + decay_unit: step + eta_min: 0.0048 + warmup_epoch: 10 + warmup_start_lr: 0.3 + warmup_prefix: True + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-6 + tensor_fusion: False + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiCropDataset + root: ./dataset/ILSVRC2012/train + size_crops: [224, 96] + num_crops: [2, 6] + min_scale_crops: [0.14, 0.05] + max_scale_crops: [1, 0.14] + sampler: + name: DistributedBatchSampler + batch_size: 128 # accum_steps: 1, total batchsize: 4096 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index cae7ebba..466ecef3 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -15,14 +15,13 @@ # Note: Set the following environment variables # and then need to run the script on each node. unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export FLAGS_stop_check_timeout=3600 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 31511a45..866322e1 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -12,10 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-#unset PADDLE_TRAINER_ENDPOINTS -# export PADDLE_NNODES=1 -# #export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # export FLAGS_stop_check_timeout=3600 unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index f5dfc176..fb44a0d4 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,15 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -#unset PADDLE_TRAINER_ENDPOINTS -#export PADDLE_NNODES=4 -#export PADDLE_MASTER="xxx.xxx.xxx.xxx:12538" -#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export FLAGS_stop_check_timeout=3600 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file From 1da07b192d1e17d8f6bda712830e6b23c012dcc2 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sun, 23 Apr 2023 16:50:22 +0800 Subject: [PATCH 06/46] update_pretrain --- passl/models/swav.py | 7 ++++++- .../configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 2 +- ...aml => swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml} | 8 +++++--- 3 files changed, 12 insertions(+), 5 deletions(-) rename tasks/ssl/swav/configs/{swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml => swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml} (94%) diff --git a/passl/models/swav.py b/passl/models/swav.py index 0181eb73..dec12dcb 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,4 +1,5 @@ import os +from sys import flags import paddle import paddle.nn as nn @@ -146,7 +147,11 @@ def swav_resnet50_finetune(**kwargs): model = SwAVFinetune(**kwargs) return model -def swav_resnet50_pretrain(**kwargs): +def swav_resnet50_pretrain(**kwargs): # todo + flags = {} + flags['FLAGS_cudnn_exhaustive_search'] = True + flags['FLAGS_cudnn_deterministic'] = True + paddle.set_flags(flags) model = SwAVPretrain(**kwargs) return model diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 251f5f49..33563b0e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0420_align_trackTrue + output_dir: ./output/baseline_0421_align_trackTrue_nolinearload device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml similarity index 94% rename from tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml rename to tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 749ef7c6..8c3f9603 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ 
b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -19,8 +19,8 @@ Global: seed: 31 # FP16 setting -# FP16: -# level: O1 +FP16: + level: O1 # GradScaler: # init_loss_scaling: 65536.0 # incr_every_n_steps: 2000 @@ -46,9 +46,11 @@ LRScheduler: warmup_prefix: True Optimizer: - name: Momentum + name: MomentumLARC momentum: 0.9 weight_decay: 1e-6 + trust_coefficient: 0.001 + clip: False tensor_fusion: False # data loader for train and eval From 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sun, 23 Apr 2023 19:16:38 +0800 Subject: [PATCH 07/46] ready_for_semi --- passl/models/swav.py | 60 +++++++++++++++++++ passl/optimizer/__init__.py | 4 +- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 21 ++++--- tasks/ssl/swav/finetune.sh | 2 +- 4 files changed, 76 insertions(+), 11 deletions(-) diff --git a/passl/models/swav.py b/passl/models/swav.py index dec12dcb..01627b66 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,3 +1,5 @@ +from collections import defaultdict +import copy import os from sys import flags @@ -5,6 +7,7 @@ import paddle.nn as nn from passl.nn import init +from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -127,6 +130,63 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') + + def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + """ + lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + """ + if custom_cfg is not None: + assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." + for item in custom_cfg: + assert isinstance( + item, dict), "The item of `custom_cfg` must be a dict" + + param_group = self._collect_params(self.res_model, tensor_fusion, config) + + return param_group + + def _collect_params(self, config, model, tensor_fusion): + # Collect different parameter groups + if self.custom_cfg is None or len(self.custom_cfg) == 0: + return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + + self.weight_decay = config['weight_decay'] + groups_num = len(self.custom_cfg) + 1 + params_list = [[] for _ in range(groups_num)] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + for idx, item in enumerate(self.custom_cfg): + if item['name'] in name: + params_list[idx].append(param) + break + else: + params_list[-1].append(param) + + res = [] + for idx, item in enumerate(self.custom_cfg): + lr_mult = item.get("lr_mult", 1.0) + weight_decay_mult = item.get("weight_decay_mult", None) + param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if self.weight_decay is not None and weight_decay_mult is not None: + param_dict['weight_decay'] = self.weight_decay * weight_decay_mult + param_dict['tensor_fusion'] = tensor_fusion + res.append(param_dict) + res.append({'params': params_list[-1]}) + + msg = 'Parameter groups for optimizer: \n' + for idx, item in enumerate(self.custom_cfg): + params_name = [p.name for p in params_list[idx]] + item = item.copy() + item['params_name'] = params_name + msg += 'Group {}: \n{} \n'.format(idx, item) + msg += 'Last group:\n params_name: {}'.format( + [p.name for p in params_list[-1]]) + logger.info(msg) + + return res + + def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index b73f0a90..609e83e9 
100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -122,6 +122,7 @@ def build_optimizer(config, lr_scheduler, model=None): config = copy.deepcopy(config) optim_name = config.pop('name') + custom_cfg = config.pop('custom_cfg', None) grad_clip = None grad_clip_config = config.pop('grad_clip', None) @@ -136,7 +137,8 @@ def build_optimizer(config, lr_scheduler, model=None): logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') if hasattr(model, 'param_groups'): - param_group = model.param_groups(no_weight_decay_name, tensor_fusion) + # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim + param_group = model.param_groups(config, tensor_fusion, custom_cfg) for group in param_group: if 'tensor_fusion' in group and group['tensor_fusion']: group['params'] = get_fused_params(group['params']) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index f2a229a0..f7950c0b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams finetune: True output_dir: ./output/semi_0420 device: gpu @@ -41,18 +41,20 @@ Loss: weight: 1.0 LRScheduler: - name: Step - learning_rate: 0.01 - boundaries: [12, 16] - values: [0.01, 0.002, 0.0004] - decay_unit: epoch - last_epoch: 0 + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 tensor_fusion: False + custom_config: + - name: head + lr_mult: 250 # data loader for train and eval DataLoader: @@ -68,9 +70,10 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] + samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 32 # accum_steps: 1, total batchsize: 256 + batch_size: 64 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -92,7 +95,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 32 + batch_size: 64 drop_last: False shuffle: False loader: diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 466ecef3..c06a84cc 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -18,7 +18,7 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ From 767494f24beb2fd9946ffff34ae4ebbb10e1584b Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sun, 23 Apr 2023 19:25:09 +0800 Subject: [PATCH 08/46] same --- passl/models/swav.py | 190 ++++++++++++------ ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 40 ++-- 2 files changed, 149 insertions(+), 81 deletions(-) diff --git a/passl/models/swav.py b/passl/models/swav.py index 8f20b6d9..01627b66 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,20 +1,25 @@ +from collections import 
defaultdict +import copy import os +from sys import flags import paddle import paddle.nn as nn from passl.nn import init +from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model __all__ = [ - # 'swav_resnet50', + 'swav_resnet50_finetune', 'swav_resnet50_linearprobe', - # 'swav_resnet50_pretrain', + 'swav_resnet50_pretrain', 'SwAV', 'SwAVLinearProbe', - # 'SwAVPretrain', + 'SwAVFinetune', + 'SwAVPretrain', ] # def model and @@ -23,7 +28,32 @@ def __init__(self, **kwargs): super().__init__() self.res_model = resnet50(**kwargs) - + def _load_model(self, path, model, tag): + if os.path.isfile(path): + para_state_dict = paddle.load(path) + + # resnet + model_state_dict = model.state_dict() + keys = model_state_dict.keys() + num_params_loaded = 0 + for k in keys: + if k not in para_state_dict: + print("{} is not in pretrained model".format(k)) + elif list(para_state_dict[k].shape) != list(model_state_dict[k] + .shape): + print( + "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" + .format(k, para_state_dict[k].shape, model_state_dict[k] + .shape)) + else: + model_state_dict[k] = para_state_dict[k] + num_params_loaded += 1 + model.set_dict(model_state_dict) + print("There are {}/{} variables loaded into {}.".format( + num_params_loaded, len(model_state_dict), tag)) + else: + print("No pretrained weights found in {} => training with random weights".format(tag)) + def load_pretrained(self, path, rank=0, finetune=False): pass # if not os.path.exists(path + '.pdparams'): @@ -55,9 +85,9 @@ def save(self, path, local_rank=0, rank=0): class SwAVLinearProbe(SwAV): - def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_bn=False, **kwargs): + def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) - self.linear = RegLog(1000, "resnet50", global_avg=True, use_bn=False) + self.linear = RegLog(class_num) self.res_model.eval() # freeze all layers but the last fc @@ -75,38 +105,11 @@ def __init__(self, class_num=1000, linear_arch="resnet50", global_avg=True, use_ def _freeze_norm(self, layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True - - def _load_model(self, path, model, tag): - if os.path.isfile(path): - para_state_dict = paddle.load(path) - - # resnet - model_state_dict = model.state_dict() - keys = model_state_dict.keys() - num_params_loaded = 0 - for k in keys: - if k not in para_state_dict: - print("{} is not in pretrained model".format(k)) - elif list(para_state_dict[k].shape) != list(model_state_dict[k] - .shape): - print( - "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" - .format(k, para_state_dict[k].shape, model_state_dict[k] - .shape)) - else: - model_state_dict[k] = para_state_dict[k] - num_params_loaded += 1 - model.set_dict(model_state_dict) - print("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), tag)) - else: - print("No pretrained weights found in {} => training with random weights".format(tag)) def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') self._load_model("linear.pdparams", self.linear, 'linear') - def forward(self, inp): # import numpy as np # import pdb; pdb.set_trace() @@ -121,16 +124,96 @@ def forward(self, inp): return output +class SwAVFinetune(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def load_pretrained(self, path, rank=0, 
finetune=False): + self._load_model(path, self.res_model, 'backbone') + + def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + """ + lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + """ + if custom_cfg is not None: + assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." + for item in custom_cfg: + assert isinstance( + item, dict), "The item of `custom_cfg` must be a dict" + + param_group = self._collect_params(self.res_model, tensor_fusion, config) + + return param_group + + def _collect_params(self, config, model, tensor_fusion): + # Collect different parameter groups + if self.custom_cfg is None or len(self.custom_cfg) == 0: + return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + + self.weight_decay = config['weight_decay'] + groups_num = len(self.custom_cfg) + 1 + params_list = [[] for _ in range(groups_num)] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + for idx, item in enumerate(self.custom_cfg): + if item['name'] in name: + params_list[idx].append(param) + break + else: + params_list[-1].append(param) + + res = [] + for idx, item in enumerate(self.custom_cfg): + lr_mult = item.get("lr_mult", 1.0) + weight_decay_mult = item.get("weight_decay_mult", None) + param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if self.weight_decay is not None and weight_decay_mult is not None: + param_dict['weight_decay'] = self.weight_decay * weight_decay_mult + param_dict['tensor_fusion'] = tensor_fusion + res.append(param_dict) + res.append({'params': params_list[-1]}) + + msg = 'Parameter groups for optimizer: \n' + for idx, item in enumerate(self.custom_cfg): + params_name = [p.name for p in params_list[idx]] + item = item.copy() + item['params_name'] = params_name + msg += 'Group {}: \n{} \n'.format(idx, item) + msg += 'Last group:\n params_name: {}'.format( + [p.name for p in params_list[-1]]) + logger.info(msg) + + return res + + + + def forward(self, inp): + return self.res_model(inp) + +class SwAVPretrain(SwAV): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def forward(self, inp): + return self.res_model(inp) + def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(linear_arch="resnet50", - global_avg=True, - use_bn=False, - output_dim=0, - eval_mode=True, - **kwargs) + model = SwAVLinearProbe(**kwargs) return model - + +def swav_resnet50_finetune(**kwargs): + model = SwAVFinetune(**kwargs) + return model + +def swav_resnet50_pretrain(**kwargs): # todo + flags = {} + flags['FLAGS_cudnn_exhaustive_search'] = True + flags['FLAGS_cudnn_deterministic'] = True + paddle.set_flags(flags) + model = SwAVPretrain(**kwargs) + return model # def normal_init(param, **kwargs): # initializer = nn.initializer.Normal(**kwargs) @@ -144,35 +227,16 @@ def swav_resnet50_linearprobe(**kwargs): class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" - def __init__(self, num_labels, arch='resnet50', global_avg=False, - use_bn=True): + def __init__(self, num_labels): super(RegLog, self).__init__() - self.bn = None - if global_avg: - if arch == 'resnet50': - s = 2048 - elif arch == 'resnet50w2': - s = 4096 - elif arch == 'resnet50w4': - s = 8192 - self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - else: - assert arch == 'resnet50' - s = 8192 - self.av_pool = paddle.nn.AvgPool2D(6, stride=1) - if use_bn: - self.bn = paddle.nn.BatchNorm2D(num_features=2048, momentum - =1 - 0.1, 
epsilon=1e-05, weight_attr=None, bias_attr= - None, use_global_stats=True) - + s = 2048 + self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) + init.normal_(self.linear.weight, mean=0.0, std=0.01) init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) - if self.bn is not None: - x = self.bn(x) - x = x.reshape((x.shape[0], -1)) return self.linear(x) \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index a290ea19..f7950c0b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,8 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0420_align_trackTrue + pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams + finetune: True + output_dir: ./output/semi_0420 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -13,12 +14,12 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 100 + epochs: 20 print_batch_step: 100 use_visualdl: False seed: 31 -# FP16 setting ignore in align +# FP16 setting # FP16: # level: O1 @@ -27,8 +28,8 @@ DistributedStrategy: # model architecture Model: - name: swav_resnet50_linearprobe - class_num: 1000 + name: swav_resnet50_finetune + output_dim: 1000 # loss function config for traing/eval process Loss: @@ -40,19 +41,21 @@ Loss: weight: 1.0 LRScheduler: - name: TimmCosine - learning_rate: 0.3 - eta_min: 0.0 - decay_unit: epoch - last_epoch: 0 - warmup_epoch: 0 + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 - weight_decay: 1e-6 - tensor_fusion: True - + weight_decay: 0.0 + tensor_fusion: False + custom_config: + - name: head + lr_mult: 250 + # data loader for train and eval DataLoader: Train: @@ -67,9 +70,10 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] + samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 32 # accum_steps: 1, total batchsize: 256 + batch_size: 64 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -91,7 +95,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 32 + batch_size: 64 drop_last: False shuffle: False loader: @@ -108,4 +112,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] + input_shape: [None, 3, 224, 224] \ No newline at end of file From 0021d00252d6d64f2a014ee09aaa66b32529131e Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 24 Apr 2023 15:41:15 +0800 Subject: [PATCH 09/46] split_params --- passl/data/__init__.py | 1 - passl/data/dataset/imagefolder_dataset.py | 12 +++-- passl/engine/engine.py | 25 ++++++---- passl/engine/loops/classification_loop.py | 4 +- passl/models/__init__.py | 2 +- passl/models/swav.py | 44 ++++++++-------- passl/optimizer/__init__.py | 50 +++++++++++-------- passl/optimizer/momentum.py | 4 -- passl/optimizer/optimizer.py | 8 +-- passl/scheduler/__init__.py | 10 +++- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 27 ++++++---- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 42 
++++++---------- tasks/ssl/swav/finetune.sh | 3 +- tasks/ssl/swav/pretrain.sh | 2 +- 14 files changed, 121 insertions(+), 113 deletions(-) diff --git a/passl/data/__init__.py b/passl/data/__init__.py index 50ce7ec5..049606f6 100644 --- a/passl/data/__init__.py +++ b/passl/data/__init__.py @@ -50,7 +50,6 @@ def build_dataloader(config, mode, device, use_dali=False, if config_batch_transform_ops is not None: batch_transform = utils.create_preprocess_operators( config_batch_transform_ops) - dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index ef03f1c4..5ad4e208 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -14,6 +14,7 @@ import os import urllib +import urllib.request import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union @@ -65,11 +66,14 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples == "semi_10": - train_data_path = os.path.join(root, "train") + # train_data_path = os.path.join(root, "train") percent = samples_tag.split('_')[-1] - subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") - list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] - samples = [(os.path.join(train_data_path, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] + # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + subset_file = str(percent) + "percent.txt" + with open(subset_file, 'r') as f: + list_imgs = [li.split('\n')[0] for li in f.readlines()] + # print(list_imgs) + samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise NotImplementedError('{} is not implemented'.format(samples)) diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 23c59ab9..24301420 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -214,17 +214,20 @@ def worker_init_fn(worker_id): # build optimizer and lr scheduler if self.mode == 'train': - config_lr_scheduler = self.config.get('LRScheduler', None) - self.lr_scheduler = None - if config_lr_scheduler is not None: - self.lr_decay_unit = config_lr_scheduler.get('decay_unit', - 'step') - self.lr_scheduler = build_lr_scheduler( - config_lr_scheduler, self.config["Global"]["epochs"], - len(self.train_dataloader)) - - self.optimizer = build_optimizer(self.config["Optimizer"], - self.lr_scheduler, self.model) + if self.config["Optimizer"].get('decay_unit', None) is not None: + self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] + else: + self.lr_decay_unit = 'step' + Warning('lr_decay_unit is not set in optimizer config, set to step by default') + # self.lr_scheduler = None + # self.lr_scheduler = build_lr_scheduler( + # config_lr_scheduler, self.config["Global"]["epochs"], + # len(self.train_dataloader)) + # # todo add lr scheduler for different group + + # self.optimizer = build_optimizer(self.config["Optimizer"], + # self.lr_scheduler, self.model) + self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader)) # load pretrained model 
if self.config["Global"]["pretrained_model"] is not None: diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index b1d8d47f..3f3c29d9 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -195,8 +195,8 @@ def train_one_step(self, batch): self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - - if self.trainer.lr_decay_unit == 'step': + + if self.trainer.lr_decay_unit == 'step': # default is step self.trainer.optimizer.lr_step(self.global_step) return out, loss_dict diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 0792faae..38ea440d 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,7 @@ from .convnext import * from .mocov3 import * from .swav import * -from .simsiam import * +# from .simsiam import * __all__ = ["build_model"] diff --git a/passl/models/swav.py b/passl/models/swav.py index 01627b66..9ae220e9 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -7,6 +7,7 @@ import paddle.nn as nn from passl.nn import init +from passl.scheduler import build_lr_scheduler, lr_scheduler from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -131,62 +132,61 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ - lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] """ - if custom_cfg is not None: - assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." - for item in custom_cfg: + + self.custom_cfg = config.pop('custom_cfg', None) + if self.custom_cfg is not None: + assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
+ for item in self.custom_cfg: assert isinstance( item, dict), "The item of `custom_cfg` must be a dict" - param_group = self._collect_params(self.res_model, tensor_fusion, config) + param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) return param_group - def _collect_params(self, config, model, tensor_fusion): + def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): # Collect different parameter groups if self.custom_cfg is None or len(self.custom_cfg) == 0: - return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] + # split params self.weight_decay = config['weight_decay'] - groups_num = len(self.custom_cfg) + 1 - params_list = [[] for _ in range(groups_num)] + params_dict = {item['name']: [] for item in self.custom_cfg} for name, param in model.named_parameters(): if param.stop_gradient: continue for idx, item in enumerate(self.custom_cfg): - if item['name'] in name: - params_list[idx].append(param) + if item['name'] in name and item['name']!='PasslDefault': + params_dict[item['name']].append(param) break else: - params_list[-1].append(param) + params_dict['PasslDefault'].append(param) res = [] - for idx, item in enumerate(self.custom_cfg): - lr_mult = item.get("lr_mult", 1.0) + for item in self.custom_cfg: weight_decay_mult = item.get("weight_decay_mult", None) - param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if item.get("LRScheduler", None) is not None: + lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) + param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} + if self.weight_decay is not None and weight_decay_mult is not None: param_dict['weight_decay'] = self.weight_decay * weight_decay_mult param_dict['tensor_fusion'] = tensor_fusion res.append(param_dict) - res.append({'params': params_list[-1]}) msg = 'Parameter groups for optimizer: \n' for idx, item in enumerate(self.custom_cfg): - params_name = [p.name for p in params_list[idx]] + params_name = [p.name for p in params_dict[item['name']]] item = item.copy() item['params_name'] = params_name msg += 'Group {}: \n{} \n'.format(idx, item) - msg += 'Last group:\n params_name: {}'.format( - [p.name for p in params_list[-1]]) logger.info(msg) return res - - def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 609e83e9..2fb4a14f 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -32,16 +32,16 @@ # from .momentum_lars import MomentumLARS -# def build_optimizer(config, lr_scheduler, model=None): -# config = copy.deepcopy(config) +# def build_optimizer(optim_config, lr_scheduler, model=None): +# optim_config = copy.deepcopy(optim_config) # grad_clip = None -# grad_clip_config = config.pop('grad_clip', None) +# grad_clip_config = optim_config.pop('grad_clip', None) # if grad_clip_config is not None: # grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') # grad_clip = eval(grad_clip_name)(**grad_clip_config) -# no_weight_decay_name = config.pop('no_weight_decay_name', []) +# no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) # param_group = defaultdict(list) # for n, p in model.named_parameters(): @@ -74,11 +74,11 @@ # params.append(group) -# optim_name = config.pop('name') +# optim_name = optim_config.pop('name') # optim = eval(optim_name)(params, # 
lr=lr_scheduler, # grad_clip=grad_clip, -# **config) +# **optim_config) # logger.debug("build optimizer ({}) success..".format(optim)) # return optim @@ -119,29 +119,30 @@ from .momentum_larc import MomentumLARC -def build_optimizer(config, lr_scheduler, model=None): - config = copy.deepcopy(config) - optim_name = config.pop('name') - custom_cfg = config.pop('custom_cfg', None) +def build_optimizer(optim_config, model, config, trainset_length): + optim_config = copy.deepcopy(optim_config) + optim_name = optim_config.pop('name') grad_clip = None - grad_clip_config = config.pop('grad_clip', None) + grad_clip_config = optim_config.pop('grad_clip', None) if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval(grad_clip_name)(**grad_clip_config) - no_weight_decay_name = config.pop('no_weight_decay_name', []) - tensor_fusion = config.pop('tensor_fusion', True) + no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) + tensor_fusion = optim_config.pop('tensor_fusion', True) if 'LAR' in optim_name: tensor_fusion = False logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') if hasattr(model, 'param_groups'): # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim - param_group = model.param_groups(config, tensor_fusion, custom_cfg) + param_group = model.param_groups(optim_config, tensor_fusion, config["Global"]["epochs"], trainset_length) for group in param_group: if 'tensor_fusion' in group and group['tensor_fusion']: group['params'] = get_fused_params(group['params']) + optim_config.pop('custom_cfg', None) + else: param_group_map = defaultdict(list) for n, p in model.named_parameters(): @@ -175,16 +176,21 @@ def build_optimizer(config, lr_scheduler, model=None): param_group.append(group) - lr = lr_scheduler - lr_func = None - if isinstance(lr_scheduler, LRCallable): - lr = lr_scheduler.lr - lr_func = lr_scheduler + # lr = lr_scheduler + # lr_func = None + # if isinstance(lr_scheduler, LRCallable): # 如果是自定义的 scheduler,则lr为数字,使用lr_func 进行lr的迭代 + # lr = lr_scheduler.lr + # lr_func = lr_scheduler + + for i, item in enumerate(param_group): + for key, val in item.items(): + if key != 'params': + print(' {} is {}'.format(key, val)) + else: + print("Group {}: param: {}".format(i, [p.name for p in item[key]])) optim = eval(optim_name)(param_group, - lr=lr, - lr_func=lr_func, grad_clip=grad_clip, - **config) + **optim_config) logger.debug("build optimizer ({}) success..".format(optim)) return optim diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 8b569c7c..179839fc 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -26,8 +26,6 @@ class Momentum(Optimizer): def __init__(self, params, - lr=0.001, - lr_func=None, momentum=0.9, weight_decay=0.0, use_master_param=True, @@ -35,8 +33,6 @@ def __init__(self, **args): defaults = dict( - lr=lr, - lr_func=lr_func, momentum=momentum, weight_decay=weight_decay, use_master_param=use_master_param, diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 98e6a3b1..d3f4ae63 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -206,12 +206,12 @@ def clear_grad(self, set_to_zero=True): @paddle.no_grad() def lr_step(self, step=None): - for group in self.param_groups: + for i, group in enumerate(self.param_groups): lr = group['lr'] - if isinstance(lr, paddle.optimizer.lr.LRScheduler): 
+ + if isinstance(lr, paddle.optimizer.lr.LRScheduler): # group defined lr scheduler lr.step(step) - elif 'lr_func' in group and callable(group['lr_func']): - group['lr_func'](group, step) + print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) @paddle.no_grad() def get_lr(self, group_id=0): diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index ecfb2cf6..4f31e170 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -12,6 +12,7 @@ # limitations under the License. import paddle +from paddle.optimizer.lr import MultiStepDecay from passl.utils import logger @@ -19,10 +20,15 @@ from .lr_callable import LRCallable, CosineWithFixLR -def build_lr_scheduler(lr_config, epochs, step_each_epoch): - lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) +def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch, 'decay_unit': decay_unit }) if 'name' in lr_config: lr_name = lr_config.pop('name') + if "MultiStepDecay" in lr_name: + lr_config.pop('epochs') + lr_config.pop('step_each_epoch') + lr_config.pop('decay_unit') + print(lr_config) lr = eval(lr_name)(**lr_config) if isinstance(lr, paddle.optimizer.lr.LRScheduler): return lr diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index f7950c0b..9781c34e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0420 + output_dir: ./output/semi_0424 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -40,21 +40,28 @@ Loss: - CELoss: weight: 1.0 -LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 tensor_fusion: False - custom_config: + decay_unit: epoch + custom_cfg: - name: head - lr_mult: 250 + LRScheduler: + name: MultiStepDecay + learning_rate: 5 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 + - name: PasslDefault + LRScheduler: + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 3136121c..33563b0e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,14 +4,8 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null -<<<<<<< HEAD - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams - finetune: True - output_dir: ./output/semi_0420 -======= pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams output_dir: ./output/baseline_0421_align_trackTrue_nolinearload ->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 
device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -19,12 +13,12 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 20 + epochs: 100 print_batch_step: 100 use_visualdl: False seed: 31 -# FP16 setting +# FP16 setting ignore in align # FP16: # level: O1 @@ -33,15 +27,10 @@ DistributedStrategy: # model architecture Model: -<<<<<<< HEAD - name: swav_resnet50_finetune - output_dim: 1000 -======= name: swav_resnet50_linearprobe output_dim: 0 eval_mode: True class_num: 1000 ->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 # loss function config for traing/eval process Loss: @@ -53,21 +42,19 @@ Loss: weight: 1.0 LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 + name: TimmCosine + learning_rate: 0.3 + eta_min: 0.0 + decay_unit: epoch + last_epoch: 0 + warmup_epoch: 0 Optimizer: name: Momentum momentum: 0.9 - weight_decay: 0.0 - tensor_fusion: False - custom_config: - - name: head - lr_mult: 250 - + weight_decay: 1e-6 + tensor_fusion: True + # data loader for train and eval DataLoader: Train: @@ -82,10 +69,9 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 64 # accum_steps: 1, total batchsize: 256 + batch_size: 32 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -107,7 +93,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 64 + batch_size: 32 drop_last: False shuffle: False loader: @@ -124,4 +110,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] \ No newline at end of file + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index c06a84cc..5aa3ff33 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -19,9 +19,10 @@ unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 +export https_proxy="http://172.19.56.199:3128" python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index fb44a0d4..d1c866c6 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -22,4 +22,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file From 57af8e9814fa3b244cea0d608b813ac6df8a4eb3 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 24 Apr 2023 15:41:15 +0800 Subject: [PATCH 10/46] split_params --- passl/data/__init__.py | 1 - passl/data/dataset/imagefolder_dataset.py | 12 +- passl/engine/engine.py | 17 +-- passl/engine/loops/classification_loop.py | 4 +- passl/models/__init__.py | 2 +- passl/models/swav.py | 51 ++++---- passl/optimizer/__init__.py | 112 ++---------------- passl/optimizer/momentum.py | 4 - passl/optimizer/optimizer.py | 7 +- passl/scheduler/__init__.py | 10 +- 
passl/scheduler/lr_scheduler.py | 6 +- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 27 +++-- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 47 +++----- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 20 ++-- tasks/ssl/swav/finetune.sh | 3 +- tasks/ssl/swav/pretrain.sh | 2 +- 16 files changed, 119 insertions(+), 206 deletions(-) diff --git a/passl/data/__init__.py b/passl/data/__init__.py index 50ce7ec5..049606f6 100644 --- a/passl/data/__init__.py +++ b/passl/data/__init__.py @@ -50,7 +50,6 @@ def build_dataloader(config, mode, device, use_dali=False, if config_batch_transform_ops is not None: batch_transform = utils.create_preprocess_operators( config_batch_transform_ops) - dataset = eval("dataset.{}".format(dataset_name))(**config_dataset) logger.debug("build dataset({}) success...".format(dataset)) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index ef03f1c4..5ad4e208 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -14,6 +14,7 @@ import os import urllib +import urllib.request import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union @@ -65,11 +66,14 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples == "semi_10": - train_data_path = os.path.join(root, "train") + # train_data_path = os.path.join(root, "train") percent = samples_tag.split('_')[-1] - subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") - list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] - samples = [(os.path.join(train_data_path, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] + # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + subset_file = str(percent) + "percent.txt" + with open(subset_file, 'r') as f: + list_imgs = [li.split('\n')[0] for li in f.readlines()] + # print(list_imgs) + samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise NotImplementedError('{} is not implemented'.format(samples)) diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 23c59ab9..378a387b 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -214,17 +214,12 @@ def worker_init_fn(worker_id): # build optimizer and lr scheduler if self.mode == 'train': - config_lr_scheduler = self.config.get('LRScheduler', None) - self.lr_scheduler = None - if config_lr_scheduler is not None: - self.lr_decay_unit = config_lr_scheduler.get('decay_unit', - 'step') - self.lr_scheduler = build_lr_scheduler( - config_lr_scheduler, self.config["Global"]["epochs"], - len(self.train_dataloader)) - - self.optimizer = build_optimizer(self.config["Optimizer"], - self.lr_scheduler, self.model) + if self.config["Optimizer"].get('decay_unit', None) is not None: + self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] + else: + self.lr_decay_unit = 'step' + Warning('lr_decay_unit is not set in optimizer config, set to step by default') + self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader)) # load pretrained model if self.config["Global"]["pretrained_model"] is not None: diff --git a/passl/engine/loops/classification_loop.py 
b/passl/engine/loops/classification_loop.py index b1d8d47f..3f3c29d9 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -195,8 +195,8 @@ def train_one_step(self, batch): self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - - if self.trainer.lr_decay_unit == 'step': + + if self.trainer.lr_decay_unit == 'step': # default is step self.trainer.optimizer.lr_step(self.global_step) return out, loss_dict diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 0792faae..38ea440d 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,7 @@ from .convnext import * from .mocov3 import * from .swav import * -from .simsiam import * +# from .simsiam import * __all__ = ["build_model"] diff --git a/passl/models/swav.py b/passl/models/swav.py index 01627b66..c9ee2e6a 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -7,6 +7,7 @@ import paddle.nn as nn from passl.nn import init +from passl.scheduler import build_lr_scheduler, lr_scheduler from passl.utils import logger from passl.models.resnet import resnet50 from passl.models.base_model import Model @@ -131,62 +132,66 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - def param_groups(self, config, tensor_fusion=True, custom_cfg=None): + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ - lr_group(dict|optional): [{'name': 'backbone', 'lr_mult': 0.1}, {'name': 'norm', 'weight_decay_mult': 0}] + custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] """ - if custom_cfg is not None: - assert isinstance(custom_cfg, list), "`custom_cfg` must be a list." - for item in custom_cfg: + + self.custom_cfg = config.pop('custom_cfg', None) + if self.custom_cfg is not None: + assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." + assert self.custom_cfg['PasslDefault'].get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' 
+ for item in self.custom_cfg: assert isinstance( item, dict), "The item of `custom_cfg` must be a dict" - param_group = self._collect_params(self.res_model, tensor_fusion, config) + param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) return param_group - def _collect_params(self, config, model, tensor_fusion): + def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): # Collect different parameter groups if self.custom_cfg is None or len(self.custom_cfg) == 0: - return {'params': model.parameters(), 'tensor_fusion': tensor_fusion} + return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] + # split params self.weight_decay = config['weight_decay'] - groups_num = len(self.custom_cfg) + 1 - params_list = [[] for _ in range(groups_num)] + params_dict = {item['name']: [] for item in self.custom_cfg} for name, param in model.named_parameters(): if param.stop_gradient: continue for idx, item in enumerate(self.custom_cfg): - if item['name'] in name: - params_list[idx].append(param) + if item['name'] in name and item['name']!='PasslDefault': + params_dict[item['name']].append(param) break else: - params_list[-1].append(param) - + params_dict['PasslDefault'].append(param) res = [] - for idx, item in enumerate(self.custom_cfg): - lr_mult = item.get("lr_mult", 1.0) + for item in self.custom_cfg: weight_decay_mult = item.get("weight_decay_mult", None) - param_dict = {'params': params_list[idx], 'learning_rate': lr_mult} + if item.get("LRScheduler", None) is not None: + lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) + + else: + Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) + # todo: initialize LRCallable here. + lr_scheduler = build_lr_scheduler(self.custom_cfg['PasslDefault']['LRScheduler'], epochs, trainset_length, config['decay_unit']) + param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} + if self.weight_decay is not None and weight_decay_mult is not None: param_dict['weight_decay'] = self.weight_decay * weight_decay_mult param_dict['tensor_fusion'] = tensor_fusion res.append(param_dict) - res.append({'params': params_list[-1]}) msg = 'Parameter groups for optimizer: \n' for idx, item in enumerate(self.custom_cfg): - params_name = [p.name for p in params_list[idx]] + params_name = [p.name for p in params_dict[item['name']]] item = item.copy() item['params_name'] = params_name msg += 'Group {}: \n{} \n'.format(idx, item) - msg += 'Last group:\n params_name: {}'.format( - [p.name for p in params_list[-1]]) logger.info(msg) return res - - def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 609e83e9..5a2add56 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -1,88 +1,3 @@ -# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. -# # You may obtain a copy of the License at -# # http://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# # See the License for the specific language governing permissions and -# # limitations under the License. - -# from __future__ import absolute_import -# from __future__ import division -# from __future__ import print_function - -# from collections import defaultdict - -# import copy -# import paddle - -# from passl.core.grad_clip import ClipGradByGlobalNorm -# from passl.core.param_fuse import get_fused_params - -# from passl.utils import logger - -# from .optimizer import Optimizer -# from .adamw import AdamW -# from .adafactor import Adafactor -# from .momentum import Momentum -# from .momentum_lars import MomentumLARS - - -# def build_optimizer(config, lr_scheduler, model=None): -# config = copy.deepcopy(config) - -# grad_clip = None -# grad_clip_config = config.pop('grad_clip', None) -# if grad_clip_config is not None: -# grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') -# grad_clip = eval(grad_clip_name)(**grad_clip_config) - -# no_weight_decay_name = config.pop('no_weight_decay_name', []) - -# param_group = defaultdict(list) -# for n, p in model.named_parameters(): -# state = copy.deepcopy(p.__dict__) -# if any(nd in n for nd in no_weight_decay_name): -# state['no_weight_decay'] = True -# param_group[str(state)].append(p) - -# # fuse params -# for key in param_group: -# if 'gpu' not in paddle.get_device(): -# continue -# if "'is_distributed': True" in key: -# continue -# if "'has_sparse_grad': True" in key: -# continue - -# param_group[key] = get_fused_params(param_group[key]) - -# # bulid optimizer params -# params = [] -# for key in param_group: -# group = {'params': param_group[key]} - -# if "'is_distributed': True" in key: -# group['is_distributed'] = True - -# if 'no_weight_decay' in key: -# group['weight_decay'] = 0.0 - -# params.append(group) - -# optim_name = config.pop('name') -# optim = eval(optim_name)(params, -# lr=lr_scheduler, -# grad_clip=grad_clip, -# **config) -# logger.debug("build optimizer ({}) success..".format(optim)) -# return optim - - # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -119,29 +34,30 @@ from .momentum_larc import MomentumLARC -def build_optimizer(config, lr_scheduler, model=None): - config = copy.deepcopy(config) - optim_name = config.pop('name') - custom_cfg = config.pop('custom_cfg', None) +def build_optimizer(optim_config, model, config, trainset_length): + optim_config = copy.deepcopy(optim_config) + optim_name = optim_config.pop('name') grad_clip = None - grad_clip_config = config.pop('grad_clip', None) + grad_clip_config = optim_config.pop('grad_clip', None) if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval(grad_clip_name)(**grad_clip_config) - no_weight_decay_name = config.pop('no_weight_decay_name', []) - tensor_fusion = config.pop('tensor_fusion', True) + no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) + tensor_fusion = optim_config.pop('tensor_fusion', True) if 'LAR' in optim_name: tensor_fusion = False logger.info('LARS or LARC Optimizer can not use tensor fusion technology. 
It automatically fall back to `tensor_fusion = False`.') if hasattr(model, 'param_groups'): # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim - param_group = model.param_groups(config, tensor_fusion, custom_cfg) + param_group = model.param_groups(optim_config, tensor_fusion, config["Global"]["epochs"], trainset_length) for group in param_group: if 'tensor_fusion' in group and group['tensor_fusion']: group['params'] = get_fused_params(group['params']) + optim_config.pop('custom_cfg', None) + else: param_group_map = defaultdict(list) for n, p in model.named_parameters(): @@ -175,16 +91,8 @@ def build_optimizer(config, lr_scheduler, model=None): param_group.append(group) - lr = lr_scheduler - lr_func = None - if isinstance(lr_scheduler, LRCallable): - lr = lr_scheduler.lr - lr_func = lr_scheduler - optim = eval(optim_name)(param_group, - lr=lr, - lr_func=lr_func, grad_clip=grad_clip, - **config) + **optim_config) logger.debug("build optimizer ({}) success..".format(optim)) return optim diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 8b569c7c..179839fc 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -26,8 +26,6 @@ class Momentum(Optimizer): def __init__(self, params, - lr=0.001, - lr_func=None, momentum=0.9, weight_decay=0.0, use_master_param=True, @@ -35,8 +33,6 @@ def __init__(self, **args): defaults = dict( - lr=lr, - lr_func=lr_func, momentum=momentum, weight_decay=weight_decay, use_master_param=use_master_param, diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 98e6a3b1..b0026c76 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -206,12 +206,15 @@ def clear_grad(self, set_to_zero=True): @paddle.no_grad() def lr_step(self, step=None): - for group in self.param_groups: + for i, group in enumerate(self.param_groups): lr = group['lr'] - if isinstance(lr, paddle.optimizer.lr.LRScheduler): + + if isinstance(lr, paddle.optimizer.lr.LRScheduler): # group defined lr scheduler lr.step(step) elif 'lr_func' in group and callable(group['lr_func']): group['lr_func'](group, step) + # todo: compact LRCallable + print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) @paddle.no_grad() def get_lr(self, group_id=0): diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index ecfb2cf6..4f31e170 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -12,6 +12,7 @@ # limitations under the License. 
import paddle +from paddle.optimizer.lr import MultiStepDecay from passl.utils import logger @@ -19,10 +20,15 @@ from .lr_callable import LRCallable, CosineWithFixLR -def build_lr_scheduler(lr_config, epochs, step_each_epoch): - lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) +def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch, 'decay_unit': decay_unit }) if 'name' in lr_config: lr_name = lr_config.pop('name') + if "MultiStepDecay" in lr_name: + lr_config.pop('epochs') + lr_config.pop('step_each_epoch') + lr_config.pop('decay_unit') + print(lr_config) lr = eval(lr_name)(**lr_config) if isinstance(lr, paddle.optimizer.lr.LRScheduler): return lr diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index fb8c7c97..1159a27c 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -23,7 +23,7 @@ class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, - step_each_epoch, # len(train_loader) = dataset/total_bs + step_each_epoch, epochs, decay_unit='epoch', eta_min=0.0, @@ -123,8 +123,8 @@ class Step(lr.LRScheduler): def __init__(self, step_each_epoch, epochs, - boundaries, # [12, 16] - values, #[0.01, 0.002, 0.0004], + boundaries, + values, warmup_steps=0, warmup_epochs=0, decay_unit='epoch', diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index f7950c0b..9781c34e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0420 + output_dir: ./output/semi_0424 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -40,21 +40,28 @@ Loss: - CELoss: weight: 1.0 -LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 Optimizer: name: Momentum momentum: 0.9 weight_decay: 0.0 tensor_fusion: False - custom_config: + decay_unit: epoch + custom_cfg: - name: head - lr_mult: 250 + LRScheduler: + name: MultiStepDecay + learning_rate: 5 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 + - name: PasslDefault + LRScheduler: + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 3136121c..f80b2fa5 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -4,14 +4,8 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null -<<<<<<< HEAD - pretrained_model: /ssd2/tangshiyu/Code/PASSL/swav_800ep_pretrain.pdparams - finetune: True - output_dir: ./output/semi_0420 -======= pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams output_dir: ./output/baseline_0421_align_trackTrue_nolinearload 
->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -19,12 +13,12 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 20 + epochs: 100 print_batch_step: 100 use_visualdl: False seed: 31 -# FP16 setting +# FP16 setting ignore in align # FP16: # level: O1 @@ -33,15 +27,10 @@ DistributedStrategy: # model architecture Model: -<<<<<<< HEAD - name: swav_resnet50_finetune - output_dim: 1000 -======= name: swav_resnet50_linearprobe output_dim: 0 eval_mode: True class_num: 1000 ->>>>>>> 709ea4dd375a62c2a5a6c3a36ea9776e30ead382 # loss function config for traing/eval process Loss: @@ -52,22 +41,21 @@ Loss: - CELoss: weight: 1.0 -LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 - Optimizer: name: Momentum momentum: 0.9 - weight_decay: 0.0 - tensor_fusion: False - custom_config: - - name: head - lr_mult: 250 - + weight_decay: 1e-6 + tensor_fusion: True + decay_unit: epoch + custom_cfg: + - name: PasslDefault + LRScheduler: + name: TimmCosine + learning_rate: 0.3 + eta_min: 0.0 + last_epoch: 0 + warmup_epoch: 0 + # data loader for train and eval DataLoader: Train: @@ -82,10 +70,9 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_1 sampler: name: DistributedBatchSampler - batch_size: 64 # accum_steps: 1, total batchsize: 256 + batch_size: 32 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -107,7 +94,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 64 + batch_size: 32 drop_last: False shuffle: False loader: @@ -124,4 +111,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] \ No newline at end of file + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 8c3f9603..c514c6bc 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -36,15 +36,6 @@ Model: output_dim: 128 nmb_prototypes: 3000 -LRScheduler: - name: TimmCosine - learning_rate: 4.8 - decay_unit: step - eta_min: 0.0048 - warmup_epoch: 10 - warmup_start_lr: 0.3 - warmup_prefix: True - Optimizer: name: MomentumLARC momentum: 0.9 @@ -52,6 +43,17 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False + decay_unit: epoch + custom_cfg: + - name: PasslDefault + LRScheduler: + name: TimmCosine + learning_rate: 4.8 + decay_unit: step + eta_min: 0.0048 + warmup_epoch: 10 + warmup_start_lr: 0.3 + warmup_prefix: True # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index c06a84cc..5aa3ff33 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -19,9 +19,10 @@ unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 +export https_proxy="http://172.19.56.199:3128" python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain.sh 
b/tasks/ssl/swav/pretrain.sh index fb44a0d4..d1c866c6 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -22,4 +22,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yml \ No newline at end of file + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file From 277ab82ceebb9d9f49ee5f9b570b87ce3976e884 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 25 Apr 2023 17:49:44 +0800 Subject: [PATCH 11/46] validate_ft --- passl/data/dataset/imagefolder_dataset.py | 2 +- passl/engine/loops/classification_loop.py | 43 ++++++++++++------- passl/models/resnet.py | 3 +- passl/models/swav.py | 34 +++++++++------ ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 5 +-- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 2 +- tasks/ssl/swav/finetune.sh | 6 ++- 7 files changed, 60 insertions(+), 35 deletions(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 5ad4e208..5d994267 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -65,7 +65,7 @@ def __init__(self, classes, class_to_idx = self.find_classes(self.root) if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) - elif samples_tag == "semi_1" or samples == "semi_10": + elif samples_tag == "semi_1" or samples_tag == "semi_10": # train_data_path = os.path.join(root, "train") percent = samples_tag.split('_')[-1] # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 3f3c29d9..6463357a 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -108,12 +108,7 @@ def log_model(model, logger): model1 = model.res_model for name, param in model1.named_parameters(): logger.info(name) - logger.info(param.abs().sum()) - - model2 = model.linear - for name, param in model2.named_parameters(): - logger.info(name) - logger.info(param.abs().sum()) + logger.info(param.abs().mean()) class ClassificationTrainingEpochLoop(TrainingEpochLoop): @@ -135,7 +130,14 @@ def forward_backward(self, batch): for idx in range(self.trainer.accum_steps): data = batch[0][idx * step_size:(idx + 1) * step_size] label = batch[1][idx * step_size:(idx + 1) * step_size] - + + ####### test ####### + # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') + # import numpy as np + # np.random.seed(42) + # a = np.random.rand(32, 3, 224, 224) + # data = paddle.to_tensor(a).astype('float32') + # do cast if using fp16 otherwise do nothing with paddle.amp.auto_cast( enable=self.trainer.fp16, @@ -145,12 +147,12 @@ def forward_backward(self, batch): out = self.trainer.model(data) final_out.append(out) - - # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') - + loss_dict = self.trainer.train_loss_func(out, label) - - # logger1 = init_logger('first') + # import pdb; 
pdb.set_trace() + + ####### test ####### + # logger1 = init_logger('before') # log_model(self.trainer.model, logger1) for key in loss_dict: @@ -163,6 +165,7 @@ def forward_backward(self, batch): scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() + ####### test ####### # grad_sync(self.trainer.optimizer.param_groups) # # do unscale and step if using fp16 and not found nan/inf @@ -172,13 +175,12 @@ def forward_backward(self, batch): # # otherwise do nothing # self.trainer.scaler.update() - # logger2 = init_logger('second') + # logger2 = init_logger('after') # log_model(self.trainer.model, logger2) - # import pdb; pdb.set_trace() out = paddle.concat(final_out, axis=0) - return out, final_loss_dict, + return out, final_loss_dict def train_one_step(self, batch): @@ -278,9 +280,20 @@ def eval_one_dataset(self, eval_dataloader): custom_white_list=self.trainer.fp16_custom_white_list, custom_black_list=self.trainer.fp16_custom_black_list, level=self.trainer.fp16_level): + + ####### test ####### + # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960, 133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') + # import numpy as np + # np.random.seed(42) + # a = np.random.rand(32, 3, 224, 224) + # data = paddle.to_tensor(a).astype('float32') + + # import pdb; pdb.set_trace() + # out = self.trainer.model(data) out = self.trainer.model(batch[0]) # calc loss if self.trainer.eval_loss_func is not None: + # loss_dict = self.trainer.eval_loss_func(out, target) loss_dict = self.trainer.eval_loss_func(out, batch[1]) for key in loss_dict: if key not in output_info: diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 1fa48c34..34761215 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -109,7 +109,7 @@ def __init__(self, block, layers, zero_init_residual=False, groups=1, super(ResNet, self).__init__() if norm_layer is None: - norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=False) + norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=True) self._norm_layer = norm_layer self.eval_mode = eval_mode self.padding = paddle.nn.Pad2D(padding=1, value=0.0) @@ -196,7 +196,6 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False): return paddle.nn.Sequential(*layers) def forward_backbone(self, x): - x = self.padding(x) x = self.conv1(x) x = self.bn1(x) diff --git a/passl/models/swav.py b/passl/models/swav.py index 414fd5f2..377e6839 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -109,16 +109,9 @@ def _freeze_norm(self, layer): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - self._load_model("linear.pdparams", self.linear, 'linear') + # self._load_model("linear.pdparams", self.linear, 'linear') def forward(self, inp): -# import numpy as np - # import pdb; pdb.set_trace() - -# np.random.seed(42) -# a = np.random.rand(32, 3, 224, 224) -# inp = paddle.to_tensor(a).astype('float32') - with paddle.no_grad(): output = self.res_model(inp) output = self.linear(output) @@ -128,10 +121,16 @@ def forward(self, inp): class SwAVFinetune(SwAV): def __init__(self, **kwargs): super().__init__(**kwargs) + self.apply(self._freeze_norm) def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, 
self.res_model, 'backbone') - + # self._load_model("projection_head.pdparams", self.res_model.projection_head, 'projection_head') + + def _freeze_norm(self, layer): + if isinstance(layer, (nn.layer.norm._BatchNormBase)): + layer._use_global_stats = True + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] @@ -140,9 +139,12 @@ def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length= self.custom_cfg = config.pop('custom_cfg', None) if self.custom_cfg is not None: assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." - assert self.custom_cfg['PasslDefault'].get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' for item in self.custom_cfg: - assert isinstance( + if item['name']=='PasslDefault': + assert item.get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' + + for item in self.custom_cfg: + assert isinstance( item, dict), "The item of `custom_cfg` must be a dict" param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) @@ -209,14 +211,22 @@ def swav_resnet50_linearprobe(**kwargs): return model def swav_resnet50_finetune(**kwargs): + # flags = {} + # flags['FLAGS_cudnn_exhaustive_search'] = False + # flags['FLAGS_cudnn_deterministic'] = False + # paddle.set_flags(flags) model = SwAVFinetune(**kwargs) + if paddle.distributed.get_world_size() > 1: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) return model def swav_resnet50_pretrain(**kwargs): # todo flags = {} flags['FLAGS_cudnn_exhaustive_search'] = True - flags['FLAGS_cudnn_deterministic'] = True + flags['FLAGS_cudnn_deterministic'] = False paddle.set_flags(flags) + if paddle.distributed.get_world_size() > 1: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) model = SwAVPretrain(**kwargs) return model diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index 9781c34e..dd7fa58d 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -6,7 +6,7 @@ Global: checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0424 + output_dir: ./output/semi_0425_readyagain device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -15,7 +15,7 @@ Global: eval_unit: "epoch" accum_steps: 1 epochs: 20 - print_batch_step: 100 + print_batch_step: 50 # 50 use_visualdl: False seed: 31 @@ -40,7 +40,6 @@ Loss: - CELoss: weight: 1.0 - Optimizer: name: Momentum momentum: 0.9 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index e3b179bb..8e86cb41 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0421_align_trackTrue_nolinearload + output_dir: ./output/baseline_0425 
device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 5aa3ff33..f37e8767 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -18,7 +18,8 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=4,1,2,3 #,4,5,6,7 +# export CUDA_VISIBLE_DEVICES=4 #,1,2,3 +export CUDA_VISIBLE_DEVICES=5,6,7,0 export https_proxy="http://172.19.56.199:3128" python -m paddle.distributed.launch \ @@ -26,3 +27,6 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml + + # --log_dir='output' \ + \ No newline at end of file From 5e739fa5b3c8d90af82373a948c82b4575fe820c Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 26 Apr 2023 10:20:42 +0800 Subject: [PATCH 12/46] format --- passl/data/dataset/imagefolder_dataset.py | 8 ++++---- passl/models/swav.py | 13 ------------- passl/scheduler/lr_scheduler.py | 4 ++-- .../swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 10 +++------- .../swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 7 +------ tasks/ssl/swav/finetune.sh | 9 ++------- tasks/ssl/swav/linearprobe.sh | 3 --- 7 files changed, 12 insertions(+), 42 deletions(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 5d994267..dac2634a 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -66,13 +66,13 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples_tag == "semi_10": - # train_data_path = os.path.join(root, "train") - percent = samples_tag.split('_')[-1] + # connection reset # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") - subset_file = str(percent) + "percent.txt" + # list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] + subset_file = str(samples_tag.split('_')[-1]) + "percent.txt" with open(subset_file, 'r') as f: list_imgs = [li.split('\n')[0] for li in f.readlines()] - # print(list_imgs) + samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise NotImplementedError('{} is not implemented'.format(samples)) diff --git a/passl/models/swav.py b/passl/models/swav.py index 377e6839..3a318ca7 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -211,10 +211,6 @@ def swav_resnet50_linearprobe(**kwargs): return model def swav_resnet50_finetune(**kwargs): - # flags = {} - # flags['FLAGS_cudnn_exhaustive_search'] = False - # flags['FLAGS_cudnn_deterministic'] = False - # paddle.set_flags(flags) model = SwAVFinetune(**kwargs) if paddle.distributed.get_world_size() > 1: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) @@ -230,15 +226,6 @@ def swav_resnet50_pretrain(**kwargs): # todo model = SwAVPretrain(**kwargs) return model -# def normal_init(param, **kwargs): -# initializer = nn.initializer.Normal(**kwargs) -# initializer(param, param.block) - -# def constant_init(param, **kwargs): -# initializer = nn.initializer.Constant(**kwargs) -# initializer(param, param.block) - - class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" diff 
--git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 1159a27c..223ca349 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -23,7 +23,7 @@ class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, - step_each_epoch, + step_each_epoch, epochs, decay_unit='epoch', eta_min=0.0, @@ -124,7 +124,7 @@ def __init__(self, step_each_epoch, epochs, boundaries, - values, + values, warmup_steps=0, warmup_epochs=0, decay_unit='epoch', diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index dd7fa58d..946001dd 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -6,7 +6,7 @@ Global: checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams finetune: True - output_dir: ./output/semi_0425_readyagain + output_dir: ./output/semi_0426_semi10 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -15,14 +15,10 @@ Global: eval_unit: "epoch" accum_steps: 1 epochs: 20 - print_batch_step: 50 # 50 + print_batch_step: 50 use_visualdl: False seed: 31 -# FP16 setting -# FP16: -# level: O1 - DistributedStrategy: data_parallel: True @@ -76,7 +72,7 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_1 + samples_tag: semi_10 sampler: name: DistributedBatchSampler batch_size: 64 # accum_steps: 1, total batchsize: 256 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 8e86cb41..4780a9e1 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -5,7 +5,7 @@ Global: validate_loop: ClassificationEvaluationLoop checkpoint: null pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams - output_dir: ./output/baseline_0425 + output_dir: ./output device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -18,10 +18,6 @@ Global: use_visualdl: False seed: 31 -# FP16 setting ignore in align -# FP16: -# level: O1 - DistributedStrategy: data_parallel: True @@ -41,7 +37,6 @@ Loss: - CELoss: weight: 1.0 - Optimizer: name: Momentum momentum: 0.9 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index f37e8767..c577ddb1 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -12,21 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -# Note: Set the following environment variables -# and then need to run the script on each node. 
+ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -# export CUDA_VISIBLE_DEVICES=4 #,1,2,3 -export CUDA_VISIBLE_DEVICES=5,6,7,0 -export https_proxy="http://172.19.56.199:3128" +export CUDA_VISIBLE_DEVICES=4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml - - # --log_dir='output' \ \ No newline at end of file diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 866322e1..4c37392b 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# export FLAGS_stop_check_timeout=3600 unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 @@ -24,5 +23,3 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml - -# python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" tools/train.py -c \ No newline at end of file From 76056f523b5ee3c1228a6982f2e0495dcc031df0 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 28 Apr 2023 15:18:52 +0800 Subject: [PATCH 13/46] add_pretrain --- passl/data/dataset/__init__.py | 1 + passl/data/dataset/multicrop_dataset.py | 10 +- passl/data/preprocess/basic_transforms.py | 32 +++--- passl/engine/engine.py | 10 +- passl/engine/loops/classification_loop.py | 2 +- .../engine/loops/contrastive_learning_loop.py | 9 +- passl/engine/loops/loop.py | 9 +- passl/models/swav.py | 104 ++++++++++++++++-- passl/optimizer/__init__.py | 11 +- ...v_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml | 13 +-- ...av_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml | 14 +-- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 54 ++++++--- tasks/ssl/swav/pretrain.sh | 4 +- 13 files changed, 198 insertions(+), 75 deletions(-) diff --git a/passl/data/dataset/__init__.py b/passl/data/dataset/__init__.py index b3e14445..b19912e1 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -63,3 +63,4 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder +from .multicrop_dataset import MultiCropDataset diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index 926d4a59..ffa30008 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -23,7 +23,8 @@ from passl.data.dataset.imagefolder_dataset import ImageFolder from passl.data.preprocess import ( RandomApply, - GaussianBlur, + # GaussianBlur, + SimCLRGaussianBlur, NormalizeImage, RandomGrayscale, ) @@ -31,13 +32,13 @@ class MultiCropDataset(ImageFolder): def __init__(self, - dataroot, + root, size_crops, num_crops, min_scale_crops, max_scale_crops, return_label=False): - super(MultiCropDataset, self).__init__(dataroot) + super(MultiCropDataset, self).__init__(root) assert len(size_crops) == len(num_crops) assert len(min_scale_crops) == len(num_crops) @@ -80,7 +81,8 @@ def __getitem__(self, index): def get_pil_gaussian_blur(p=0.5): - gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) + # gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) + gaussian_blur = SimCLRGaussianBlur(sigma=[.1, 
2.]) rnd_gaussian_blur = RandomApply([gaussian_blur], p=p) return rnd_gaussian_blur diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index 9d9eb132..96a784c3 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -944,19 +944,19 @@ def __call__(self, img): return img -class GaussianBlur(object): - """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" - def __init__(self, sigma=[.1, 2.], _PIL=False): - self.sigma = sigma - self.kernel_size = 23 - self._PIL = _PIL - - def __call__(self, x): - sigma = np.random.uniform(self.sigma[0], self.sigma[1]) - if self._PIL: - x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) - return x - else: - x = cv2.GaussianBlur(np.array(x), - (self.kernel_size, self.kernel_size), sigma) - return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file +# class GaussianBlur(object): +# """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" +# def __init__(self, sigma=[.1, 2.], _PIL=False): +# self.sigma = sigma +# self.kernel_size = 23 +# self._PIL = _PIL + +# def __call__(self, x): +# sigma = np.random.uniform(self.sigma[0], self.sigma[1]) +# if self._PIL: +# x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) +# return x +# else: +# x = cv2.GaussianBlur(np.array(x), +# (self.kernel_size, self.kernel_size), sigma) +# return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 378a387b..aa8164a0 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -214,15 +214,21 @@ def worker_init_fn(worker_id): # build optimizer and lr scheduler if self.mode == 'train': + assert self.config.get("Optimizer", None) is not None, "Optimizer must be defined in config." if self.config["Optimizer"].get('decay_unit', None) is not None: self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] else: self.lr_decay_unit = 'step' Warning('lr_decay_unit is not set in optimizer config, set to step by default') - self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader)) + + config_lr_scheduler = self.config["Optimizer"].get('LRScheduler', None) + self.lr_scheduler = None + if config_lr_scheduler is not None: + self.lr_scheduler = build_lr_scheduler(config_lr_scheduler, self.config["Global"]["epochs"], len(self.train_dataloader), self.lr_decay_unit) + + self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader), self.lr_scheduler) # load pretrained model - if self.config["Global"]["pretrained_model"] is not None: assert isinstance( self.config["Global"]["pretrained_model"], str ), "pretrained_model type is not available. Please use `string`." 
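
The engine change above wires the Optimizer-level LRScheduler and the per-group `custom_cfg` schedulers through `build_optimizer`. Below is a minimal, self-contained sketch of that grouping logic for reference; it is not part of the patch. `collect_param_groups` and `ToyNet` are hypothetical names introduced only for illustration, and the scheduler values simply mirror the finetune config in this series (head at 5.0, PasslDefault at 0.02).

import paddle
from paddle.optimizer.lr import MultiStepDecay

def collect_param_groups(model, custom_cfg):
    # Split trainable parameters by substring match on each entry's name and
    # attach one scheduler object per group; parameters matching no entry fall
    # into the PasslDefault group, mirroring _collect_params above.
    groups = {item["name"]: [] for item in custom_cfg}
    groups.setdefault("PasslDefault", [])
    for name, param in model.named_parameters():
        if param.stop_gradient:
            continue
        for item in custom_cfg:
            if item["name"] in name and item["name"] != "PasslDefault":
                groups[item["name"]].append(param)
                break
        else:
            groups["PasslDefault"].append(param)
    res = []
    for item in custom_cfg:
        lr = MultiStepDecay(**item["LRScheduler"])  # one scheduler per group
        res.append({"params": groups[item["name"]], "lr": lr})
    return res

class ToyNet(paddle.nn.Layer):
    # Stand-in for the ResNet: a backbone plus a classification head.
    def __init__(self):
        super().__init__()
        self.backbone = paddle.nn.Linear(8, 8)
        self.head = paddle.nn.Linear(8, 2)
    def forward(self, x):
        return self.head(self.backbone(x))

custom_cfg = [
    {"name": "head",
     "LRScheduler": {"learning_rate": 5.0, "milestones": [12, 16], "gamma": 0.2}},
    {"name": "PasslDefault",
     "LRScheduler": {"learning_rate": 0.02, "milestones": [12, 16], "gamma": 0.2}},
]
for group in collect_param_groups(ToyNet(), custom_cfg):
    # Optimizer.lr_step(epoch) later advances each group's scheduler independently.
    print(len(group["params"]), "params, initial lr =", group["lr"].get_lr())
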
diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 6463357a..4c7349a2 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -182,7 +182,7 @@ def forward_backward(self, batch): out = paddle.concat(final_out, axis=0) return out, final_loss_dict - def train_one_step(self, batch): + def train_one_step(self, batch, total_iterations=None): # do forward and backward out, loss_dict = self.forward_backward(batch) diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index a772a28d..f6fe4fbe 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -69,19 +69,24 @@ def forward_backward(self, batch): return final_loss_dict - def train_one_step(self, batch): + def train_one_step(self, batch, total_iterations): # remove label batch = batch[0] # do forward and backward loss_dict = self.forward_backward(batch) + + try: + self.trainer.model.after_loss_backward(total_iterations) + except AttributeError: + logger.warning("Model has no after_loss_backward method, ignored this process") grad_sync(self.trainer.optimizer.param_groups) # do unscale and step if using fp16 and not found nan/inf # otherwise do nothing - self.trainer.scaler.step(self.trainer.optimizer) + self.trainer.scaler.step(self.trainer.optimizer) # todo # check this will updata weight, before this weight is not updated # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index 35bdfa1d..dbe60dd6 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -219,7 +219,7 @@ def run(self): self.trainer.train_dataloader.batch_sampler.set_epoch(epoch_id) # for one epoch train - self.train_one_epoch() + self.train_one_epoch(epoch_id) if self.trainer.lr_decay_unit == 'epoch': self.trainer.optimizer.lr_step(self.cur_epoch_id) @@ -257,13 +257,14 @@ def run(self): self.trainer.training = False - def train_one_epoch(self): + def train_one_epoch(self, epoch_id): self.trainer.model.train() tic = time.time() for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx + total_iterations = epoch_id*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( @@ -288,7 +289,7 @@ def train_one_epoch(self): self.global_step += 1 # do forward and backward - out, loss_dict = self.train_one_step(batch) + out, loss_dict = self.train_one_step(batch, total_iterations) self.time_info["batch_cost"].update(time.time() - tic) @@ -310,7 +311,7 @@ def train_one_epoch(self): tic = time.time() - def train_one_step(self, batch): + def train_one_step(self, batch, total_iterations): raise NotImplementedError def save_checkpoint(self): diff --git a/passl/models/swav.py b/passl/models/swav.py index 3a318ca7..502bff83 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,7 +1,8 @@ -from collections import defaultdict -import copy import os +import copy +import numpy as np from sys import flags +from collections import defaultdict import paddle import paddle.nn as nn @@ -139,9 +140,6 @@ def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length= self.custom_cfg = config.pop('custom_cfg', None) if self.custom_cfg is not None: assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
- for item in self.custom_cfg: - if item['name']=='PasslDefault': - assert item.get('LRScheduler', None) is not None, 'LRScheduler is not set in group with name PasslDefault, please set them.' for item in self.custom_cfg: assert isinstance( @@ -158,12 +156,13 @@ def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length) # split params self.weight_decay = config['weight_decay'] - params_dict = {item['name']: [] for item in self.custom_cfg} + params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault + params_dict['PasslDefault'] = [] for name, param in model.named_parameters(): if param.stop_gradient: continue for idx, item in enumerate(self.custom_cfg): - if item['name'] in name and item['name']!='PasslDefault': + if item['name'] in name: params_dict[item['name']].append(param) break else: @@ -177,13 +176,14 @@ def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length) else: Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) # todo: initialize LRCallable here. - lr_scheduler = build_lr_scheduler(self.custom_cfg['PasslDefault']['LRScheduler'], epochs, trainset_length, config['decay_unit']) param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} if self.weight_decay is not None and weight_decay_mult is not None: param_dict['weight_decay'] = self.weight_decay * weight_decay_mult param_dict['tensor_fusion'] = tensor_fusion res.append(param_dict) + else: + res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) msg = 'Parameter groups for optimizer: \n' for idx, item in enumerate(self.custom_cfg): @@ -199,11 +199,85 @@ def forward(self, inp): return self.res_model(inp) class SwAVPretrain(SwAV): - def __init__(self, **kwargs): + def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], epsilon=0.05, freeze_prototypes_niters=5005, **kwargs): super().__init__(**kwargs) + self.crops_for_assign = crops_for_assign + self.nmb_crops = nmb_crops + self.temperature = 0.1 + self.epsilon = epsilon + self.freeze_prototypes_niters = freeze_prototypes_niters + + # initialize queue + self.queue = None + # queue_path = os.path.join('.', "queue" + str(0) + ".pth") + # if os.path.isfile(queue_path): + # self.queue = paddle.load(queue_path)["queue"] + # # the queue needs to be divisible by the batch size + # queue_length = queue_length + # queue_length -= queue_length % (256) + # if queue_length > 0 and epoch >= 15 and self.queue is None: + # self.queue = paddle.zeros([len(crops_for_assign), + # queue_length // 4, kwargs['output_dim']]) + @paddle.no_grad() + def distributed_sinkhorn(self, out, sinkhorn_iterations=3): + Q = paddle.exp(x=out / self.epsilon).t() + B = Q.shape[1] * 4 + K = Q.shape[0] + sum_Q = paddle.sum(x=Q) + paddle.distributed.all_reduce(sum_Q) + Q /= sum_Q + for it in range(sinkhorn_iterations): + sum_of_rows = paddle.sum(x=Q, axis=1, keepdim=True) + paddle.distributed.all_reduce(sum_of_rows) + Q /= sum_of_rows + Q /= K + Q /= paddle.sum(x=Q, axis=0, keepdim=True) + Q /= B + Q *= B + return Q.t() + def forward(self, inp): - return self.res_model(inp) + bs = inp[0].shape[0] + + # normalize the prototypes + with paddle.no_grad(): + w = self.res_model.prototypes.weight.clone() + w = paddle.nn.functional.normalize(x=w, axis=1, p=2) + self.res_model.prototypes.weight.copy_(w) + embedding, output = self.res_model(inp) + embedding = embedding.detach() + + # compute loss + loss = 0 + for i, crop_id in 
enumerate(self.crops_for_assign): + with paddle.no_grad(): + out = output[bs * crop_id:bs * (crop_id + 1)].detach() + if self.queue is not None: + if use_the_queue or not paddle.all(x=self.queue[(i), (-1), :] == 0): + use_the_queue = True + out = paddle.concat(x=(paddle.mm(input=self.queue[i], + mat2=self.res_model.prototypes.weight.t()), out)) + self.queue[(i), bs:] = self.queue[(i), :-bs].clone() + self.queue[(i), :bs] = embedding[crop_id * bs:(crop_id + 1) * bs] + + q = self.distributed_sinkhorn(out)[-bs:] + subloss = 0 + for v in np.delete(np.arange(np.sum(self.nmb_crops)), crop_id): + x = output[bs * v:bs * (v + 1)] / self.temperature + subloss -= paddle.mean(x=paddle.sum(x=q * paddle.nn. + functional.log_softmax(x=x, axis=1), axis=1)) + loss += subloss / (np.sum(self.nmb_crops) - 1) + loss /= len(self.crops_for_assign) + + return + + def after_loss_backward(self, iteration): + if iteration < self.freeze_prototypes_niters: + for name, p in self.res_model.named_parameters(): + if 'prototypes' in name: + p.grad = None + def swav_resnet50_linearprobe(**kwargs): @@ -216,13 +290,19 @@ def swav_resnet50_finetune(**kwargs): model = nn.SyncBatchNorm.convert_sync_batchnorm(model) return model -def swav_resnet50_pretrain(**kwargs): # todo +def swav_resnet50_pretrain(apex, **kwargs): # todo flags = {} flags['FLAGS_cudnn_exhaustive_search'] = True flags['FLAGS_cudnn_deterministic'] = False paddle.set_flags(flags) if paddle.distributed.get_world_size() > 1: - model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + if not apex: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + else: + # with apex syncbn speeds up computation than global syncbn + process_group = apex.parallel.create_syncbn_process_group(8) + model = apex.parallel.convert_syncbn_model(model, process_group=process_group) + model = SwAVPretrain(**kwargs) return model diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 5a2add56..43216690 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -34,7 +34,7 @@ from .momentum_larc import MomentumLARC -def build_optimizer(optim_config, model, config, trainset_length): +def build_optimizer(optim_config, model, config, trainset_length, lr_scheduler): optim_config = copy.deepcopy(optim_config) optim_name = optim_config.pop('name') @@ -91,8 +91,17 @@ def build_optimizer(optim_config, model, config, trainset_length): param_group.append(group) + lr = lr_scheduler + lr_func = None + if isinstance(lr_scheduler, LRCallable): + lr = lr_scheduler.lr + lr_func = lr_scheduler + optim = eval(optim_name)(param_group, + lr=lr, + lr_func=lr_func, grad_clip=grad_clip, **optim_config) + logger.debug("build optimizer ({}) success..".format(optim)) return optim diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml index 946001dd..974d84b1 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml @@ -42,6 +42,12 @@ Optimizer: weight_decay: 0.0 tensor_fusion: False decay_unit: epoch + LRScheduler: + name: MultiStepDecay + learning_rate: 0.02 + milestones: [12, 16] + gamma: 0.2 + last_epoch: -1 custom_cfg: - name: head LRScheduler: @@ -50,13 +56,6 @@ Optimizer: milestones: [12, 16] gamma: 0.2 last_epoch: -1 - - name: PasslDefault - LRScheduler: - name: MultiStepDecay - learning_rate: 0.02 - milestones: [12, 16] - gamma: 0.2 - last_epoch: -1 # data loader for train 
and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml index 4780a9e1..c67ddd2a 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml @@ -43,14 +43,12 @@ Optimizer: weight_decay: 1e-6 tensor_fusion: True decay_unit: epoch - custom_cfg: - - name: PasslDefault - LRScheduler: - name: TimmCosine - learning_rate: 0.3 - eta_min: 0.0 - last_epoch: 0 - warmup_epoch: 0 + LRScheduler: + name: TimmCosine + learning_rate: 0.3 + eta_min: 0.0 + last_epoch: 0 + warmup_epoch: 0 # data loader for train and eval DataLoader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index c514c6bc..935d4ec7 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -13,14 +13,14 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 800 + epochs: 400 # 800 print_batch_step: 100 use_visualdl: False seed: 31 # FP16 setting -FP16: - level: O1 +# FP16: +# level: O1 # GradScaler: # init_loss_scaling: 65536.0 # incr_every_n_steps: 2000 @@ -31,11 +31,33 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_pretrain + apex: False + queue_length: 3804 # 0 + crops_for_assign: [0, 1] + nmb_crops: [2, 6] + epsilon: 0.05 + freeze_prototypes_niters: 5005 # 313 normalize: True hidden_mlp: 2048 output_dim: 128 nmb_prototypes: 3000 +# Optimizer: +# name: MomentumLARC +# momentum: 0.9 +# weight_decay: 1e-6 +# trust_coefficient: 0.001 +# clip: False +# tensor_fusion: False +# decay_unit: step +# LRScheduler: +# name: TimmCosine +# learning_rate: 4.8 +# eta_min: 0.0048 +# warmup_epoch: 10 +# warmup_start_lr: 0.3 +# warmup_prefix: True + Optimizer: name: MomentumLARC momentum: 0.9 @@ -43,33 +65,31 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False - decay_unit: epoch - custom_cfg: - - name: PasslDefault - LRScheduler: - name: TimmCosine - learning_rate: 4.8 - decay_unit: step - eta_min: 0.0048 - warmup_epoch: 10 - warmup_start_lr: 0.3 - warmup_prefix: True + decay_unit: step + LRScheduler: + name: TimmCosine + learning_rate: 0.6 + eta_min: 0.0006 + warmup_epoch: 0 + warmup_start_lr: 0. 
+ warmup_prefix: True + last_epoch: 0 # data loader for train and eval DataLoader: Train: dataset: name: MultiCropDataset - root: ./dataset/ILSVRC2012/train + root: ./data/ILSVRC2012/train size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] max_scale_crops: [1, 0.14] sampler: name: DistributedBatchSampler - batch_size: 128 # accum_steps: 1, total batchsize: 4096 + batch_size: 64 # 4card # 128 32 card # accum_steps: 1, total batchsize: 4096 drop_last: False shuffle: True loader: - num_workers: 8 + num_workers: 10 use_shared_memory: True diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index d1c866c6..6972eddd 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -16,7 +16,9 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=4,5,6,7 +# export CUDA_VISIBLE_DEVICES=7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ From 1d69baa127a7dddf22498a2546d7ed7691ea53c6 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 4 May 2023 20:52:07 +0800 Subject: [PATCH 14/46] valid_pretrain --- passl/data/dataset/multicrop_dataset.py | 12 +- passl/data/preprocess/basic_transforms.py | 2 +- passl/engine/engine.py | 15 +-- .../engine/loops/contrastive_learning_loop.py | 116 ++++++++++++++++-- passl/engine/loops/loop.py | 2 +- passl/models/resnet.py | 10 +- passl/models/swav.py | 51 +++++--- passl/optimizer/optimizer.py | 2 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 95 ++++++++++++++ ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 43 ++----- tasks/ssl/swav/pretrain.sh | 7 +- 11 files changed, 271 insertions(+), 84 deletions(-) create mode 100644 tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index ffa30008..42b800f7 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -36,14 +36,12 @@ def __init__(self, size_crops, num_crops, min_scale_crops, - max_scale_crops, - return_label=False): + max_scale_crops): super(MultiCropDataset, self).__init__(root) assert len(size_crops) == len(num_crops) assert len(min_scale_crops) == len(num_crops) assert len(max_scale_crops) == len(num_crops) - self.return_label = return_label color_transform = [get_color_distortion(), get_pil_gaussian_blur()] mean = [0.485, 0.456, 0.406] @@ -71,13 +69,11 @@ def __getitem__(self, index): Returns: tuple: (sample, target) where target is class_index of the target class. 
""" - path, target = self.samples[index] + path, target = self.imgs[index] sample = self.loader(path) sample = list(map(lambda trans: trans(sample), self.trans)) - if self.return_label: - return sample, target - - return sample + + return sample, target def get_pil_gaussian_blur(p=0.5): diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index 96a784c3..ace7c1d2 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -57,7 +57,7 @@ "SimCLRGaussianBlur", "BYOLSolarize", "MAERandCropImage", - "GaussianBlur" + # "GaussianBlur" ] diff --git a/passl/engine/engine.py b/passl/engine/engine.py index aa8164a0..c50b5084 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -229,13 +229,14 @@ def worker_init_fn(worker_id): self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader), self.lr_scheduler) # load pretrained model - assert isinstance( - self.config["Global"]["pretrained_model"], str - ), "pretrained_model type is not available. Please use `string`." - self.model.load_pretrained( - self.config["Global"]["pretrained_model"], - self.config["Global"]["rank"], - self.config["Global"].get("finetune", False)) + if self.config["Global"]["pretrained_model"] is not None: + assert isinstance( + self.config["Global"]["pretrained_model"], str + ), "pretrained_model type is not available. Please use `string`." + self.model.load_pretrained( + self.config["Global"]["pretrained_model"], + self.config["Global"]["rank"], + self.config["Global"].get("finetune", False)) # for distributed if self.config["Global"]["distributed"]: diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index f6fe4fbe..4ebf1346 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -16,7 +16,11 @@ from __future__ import division from __future__ import print_function +import os import sys +import logging +from datetime import timedelta + import time import collections import platform @@ -28,12 +32,87 @@ from passl.utils import logger from .loop import TrainingEpochLoop + +class LogFormatter: + def __init__(self): + self.start_time = time.time() + + def format(self, record): + elapsed_seconds = round(record.created - self.start_time) + + prefix = "%s - %s - %s" % ( + record.levelname, + time.strftime("%x %X"), + timedelta(seconds=elapsed_seconds), + ) + message = record.getMessage() + message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) + return "%s - %s" % (prefix, message) if message else "" + + +def create_logger(filepath, rank): + """ + Create a logger. + Use a different log file for each process. 
+ """ + # create log formatter + log_formatter = LogFormatter() + + # create file handler and set level to debug + if filepath is not None: + if rank > 0: + filepath = "%s-%i" % (filepath, rank) + file_handler = logging.FileHandler(filepath, "a") + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(log_formatter) + + # create console handler and set level to info + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(log_formatter) + + # create logger and set level to debug + logger = logging.getLogger() + logger.handlers = [] + logger.setLevel(logging.DEBUG) + logger.propagate = False + if filepath is not None: + logger.addHandler(file_handler) + logger.addHandler(console_handler) + + # reset logger elapsed time + def reset_time(): + log_formatter.start_time = time.time() + + logger.reset_time = reset_time + + return logger + + +def init_logger(name): + logger = create_logger( + os.path.join("{}.log".format(name)), rank=0 + ) + logger.info("============ Initialized logger ============") + logger.info("") + return logger + + +def log_model(model, logger): + model1 = model.res_model + for name, param in model1.named_parameters(): + logger.info(name) + logger.info(param.abs().sum()) + if param.grad is not None: + logger.info(name+'grad') + logger.info(param.grad.abs().sum()) + class ContrastiveLearningTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): super().__init__(trainer, epochs, max_train_step=max_train_step, val_loop=val_loop) - def forward_backward(self, batch): + def forward_backward(self, batch, total_iterations): # Gradient Merge(GuoxiaWang): Accumulate gradient over multiple # steps to save on memory. @@ -57,6 +136,9 @@ def forward_backward(self, batch): if isinstance(loss_dict, paddle.Tensor): loss_dict = {'loss': loss_dict} + ####### test ####### + # logger1 = init_logger('before_pretrain') + # log_model(self.trainer.model, logger1) for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -66,30 +148,44 @@ def forward_backward(self, batch): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - + + try: + self.trainer.model.after_loss_backward(total_iterations) + except AttributeError: + logger.warning("Model has no after_loss_backward method, ignored this process") + + ####### test ####### +# grad_sync(self.trainer.optimizer.param_groups) + +# # do unscale and step if using fp16 and not found nan/inf +# # otherwise do nothing +# self.trainer.scaler.step(self.trainer.optimizer) +# # do update loss scaling if using fp16 +# # otherwise do nothing +# self.trainer.scaler.update() + +# logger2 = init_logger('after_pretrain') + # log_model(self.trainer.model, logger2) + # print('final_loss_dict', final_loss_dict) return final_loss_dict def train_one_step(self, batch, total_iterations): # remove label batch = batch[0] - - # do forward and backward - loss_dict = self.forward_backward(batch) - try: - self.trainer.model.after_loss_backward(total_iterations) - except AttributeError: - logger.warning("Model has no after_loss_backward method, ignored this process") + # do forward and backward + loss_dict = self.forward_backward(batch, total_iterations) grad_sync(self.trainer.optimizer.param_groups) # do unscale and step if using fp16 and not found nan/inf # otherwise do nothing - self.trainer.scaler.step(self.trainer.optimizer) # todo # check this will updata 
weight, before this weight is not updated + self.trainer.scaler.step(self.trainer.optimizer) # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() + # clear gradients self.trainer.optimizer.clear_grad() diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index dbe60dd6..959bb386 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -264,7 +264,7 @@ def train_one_epoch(self, epoch_id): for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - total_iterations = epoch_id*self.total_batch_idx + batch_idx + total_iterations = (epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 34761215..735d6485 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -109,7 +109,7 @@ def __init__(self, block, layers, zero_init_residual=False, groups=1, super(ResNet, self).__init__() if norm_layer is None: - norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=True) + norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=False) self._norm_layer = norm_layer self.eval_mode = eval_mode self.padding = paddle.nn.Pad2D(padding=1, value=0.0) @@ -198,7 +198,9 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False): def forward_backbone(self, x): x = self.padding(x) x = self.conv1(x) + # print("before bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) x = self.bn1(x) + # print("bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) x = self.relu(x) x = self.maxpool(x) x = self.layer1(x) @@ -213,7 +215,9 @@ def forward_backbone(self, x): def forward_head(self, x): if self.projection_head is not None: + # print("before proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) x = self.projection_head(x) + # print(" proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) if self.l2norm: x = paddle.nn.functional.normalize(x=x, axis=1, p=2) if self.prototypes is not None: @@ -229,8 +233,7 @@ def forward(self, inputs): return_counts=True)[1], axis=0) # padiff start_idx = 0 for end_idx in idx_crops: - _out = self.forward_backbone(paddle.concat(x=inputs[start_idx: - end_idx])) + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) if start_idx == 0: output = _out else: @@ -240,7 +243,6 @@ def forward(self, inputs): class MultiPrototypes(paddle.nn.Layer): - def __init__(self, output_dim, nmb_prototypes): super(MultiPrototypes, self).__init__() self.nmb_heads = len(nmb_prototypes) diff --git a/passl/models/swav.py b/passl/models/swav.py index 502bff83..311b7b19 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -84,7 +84,10 @@ def load_pretrained(self, path, rank=0, finetune=False): def save(self, path, local_rank=0, rank=0): paddle.save(self.state_dict(), path + ".pdparams") - + + def _freeze_norm(self, layer): + if isinstance(layer, (nn.layer.norm._BatchNormBase)): + layer._use_global_stats = True class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, **kwargs): @@ -103,10 +106,6 @@ def __init__(self, class_num=1000, **kwargs): assert len(parameters) == 2 # weight, bias self.apply(self._freeze_norm) - - def _freeze_norm(self, layer): - if isinstance(layer, (nn.layer.norm._BatchNormBase)): - layer._use_global_stats = True def load_pretrained(self, path, 
rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') @@ -127,11 +126,7 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') # self._load_model("projection_head.pdparams", self.res_model.projection_head, 'projection_head') - - def _freeze_norm(self, layer): - if isinstance(layer, (nn.layer.norm._BatchNormBase)): - layer._use_global_stats = True - + def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] @@ -218,7 +213,12 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep # if queue_length > 0 and epoch >= 15 and self.queue is None: # self.queue = paddle.zeros([len(crops_for_assign), # queue_length // 4, kwargs['output_dim']]) + # self.load_pretrained('swav_800ep_pretrain.pdparams') + self.apply(self._freeze_norm) + def load_pretrained(self, path, rank=0, finetune=False): + self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') + @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): Q = paddle.exp(x=out / self.epsilon).t() @@ -238,14 +238,21 @@ def distributed_sinkhorn(self, out, sinkhorn_iterations=3): return Q.t() def forward(self, inp): + # ####### test ####### + # import numpy as np + # np.random.seed(42) + # a = np.random.rand(32, 3, 224, 224) + # inp = paddle.to_tensor(a).astype('float32') bs = inp[0].shape[0] # normalize the prototypes with paddle.no_grad(): w = self.res_model.prototypes.weight.clone() - w = paddle.nn.functional.normalize(x=w, axis=1, p=2) - self.res_model.prototypes.weight.copy_(w) + w = paddle.nn.functional.normalize(x=w, axis=0, p=2) # 1 + paddle.assign(w, self.res_model.prototypes.weight) embedding, output = self.res_model(inp) + # print('output, embedding', embedding.mean(), output.mean(), inp.mean()) + # import pdb; pdb.set_trace() embedding = embedding.detach() # compute loss @@ -253,6 +260,7 @@ def forward(self, inp): for i, crop_id in enumerate(self.crops_for_assign): with paddle.no_grad(): out = output[bs * crop_id:bs * (crop_id + 1)].detach() + # print('bs, crop_id', bs, crop_id, self.nmb_crops) if self.queue is not None: if use_the_queue or not paddle.all(x=self.queue[(i), (-1), :] == 0): use_the_queue = True @@ -262,23 +270,28 @@ def forward(self, inp): self.queue[(i), :bs] = embedding[crop_id * bs:(crop_id + 1) * bs] q = self.distributed_sinkhorn(out)[-bs:] + # print('out.mean(), q.mean()', out.mean(), q.mean()) + subloss = 0 + # print(output.shape) for v in np.delete(np.arange(np.sum(self.nmb_crops)), crop_id): x = output[bs * v:bs * (v + 1)] / self.temperature subloss -= paddle.mean(x=paddle.sum(x=q * paddle.nn. 
functional.log_softmax(x=x, axis=1), axis=1)) + # print('v, subloss', v, subloss) + loss += subloss / (np.sum(self.nmb_crops) - 1) + # print('i, loss', i, loss) + # import pdb; pdb.set_trace() loss /= len(self.crops_for_assign) - return + return loss def after_loss_backward(self, iteration): if iteration < self.freeze_prototypes_niters: for name, p in self.res_model.named_parameters(): - if 'prototypes' in name: - p.grad = None - - + if 'prototypes' in name and p.grad is not None: + p.clear_grad() def swav_resnet50_linearprobe(**kwargs): model = SwAVLinearProbe(**kwargs) @@ -295,6 +308,9 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo flags['FLAGS_cudnn_exhaustive_search'] = True flags['FLAGS_cudnn_deterministic'] = False paddle.set_flags(flags) + + model = SwAVPretrain(**kwargs) + if paddle.distributed.get_world_size() > 1: if not apex: model = nn.SyncBatchNorm.convert_sync_batchnorm(model) @@ -303,7 +319,6 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo process_group = apex.parallel.create_syncbn_process_group(8) model = apex.parallel.convert_syncbn_model(model, process_group=process_group) - model = SwAVPretrain(**kwargs) return model class RegLog(paddle.nn.Layer): diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 94c4561f..234af8b9 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -214,7 +214,7 @@ def lr_step(self, step=None): elif 'lr_func' in group and callable(group['lr_func']): group['lr_func'](group, step) - print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) + # print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) @paddle.no_grad() def get_lr(self, group_id=0): diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml new file mode 100644 index 00000000..ba38de7e --- /dev/null +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -0,0 +1,95 @@ +# global configs +Global: + task_type: ContrastiveLearning + train_loop: ContrastiveLearningTrainingEpochLoop + validate_loop: None + checkpoint: null + pretrained_model: null + output_dir: ./output/pretrain_0504_fp16 + device: gpu + save_interval: 1 + max_num_latest_checkpoint: 0 + eval_during_train: False + eval_interval: 1 + eval_unit: "epoch" + accum_steps: 1 + epochs: 400 # 800 + print_batch_step: 100 + use_visualdl: False + seed: 31 + +# FP16 setting +FP16: + level: O1 +# GradScaler: +# init_loss_scaling: 65536.0 +# incr_every_n_steps: 2000 + +DistributedStrategy: + data_parallel: True + +# model architecture +Model: + name: swav_resnet50_pretrain + apex: False + queue_length: 3804 # 0 + crops_for_assign: [0, 1] + nmb_crops: [2, 6] + epsilon: 0.05 + freeze_prototypes_niters: 5005 # 313 + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 + +# Optimizer: +# name: MomentumLARC +# momentum: 0.9 +# weight_decay: 1e-6 +# trust_coefficient: 0.001 +# clip: False +# tensor_fusion: False +# decay_unit: step +# LRScheduler: +# name: TimmCosine +# learning_rate: 4.8 +# eta_min: 0.0048 +# warmup_epoch: 10 +# warmup_start_lr: 0.3 +# warmup_prefix: True + +Optimizer: + name: MomentumLARC + momentum: 0.9 + weight_decay: 1e-6 + trust_coefficient: 0.001 + clip: False + tensor_fusion: False + decay_unit: step + LRScheduler: + name: TimmCosine + learning_rate: 0.6 + eta_min: 0.0006 + warmup_epoch: 0 + warmup_start_lr: 0. 
+ warmup_prefix: True + last_epoch: 0 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiCropDataset + root: ./data/ILSVRC2012 + size_crops: [224, 96] + num_crops: [2, 6] + min_scale_crops: [0.14, 0.05] + max_scale_crops: [1, 0.14] + sampler: + name: DistributedBatchSampler + batch_size: 64 # 4 card # 128 32 card # accum_steps: 1, total batchsize: 4096 + drop_last: True + shuffle: True + loader: + num_workers: 10 + use_shared_memory: True diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 935d4ec7..d0292c58 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -5,7 +5,7 @@ Global: validate_loop: None checkpoint: null pretrained_model: null - output_dir: ./output/pretrain_0420 + output_dir: ./output/pretrain_0504_fp16 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -13,14 +13,14 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 400 # 800 + epochs: 800 print_batch_step: 100 use_visualdl: False seed: 31 # FP16 setting -# FP16: -# level: O1 +FP16: + level: O1 # GradScaler: # init_loss_scaling: 65536.0 # incr_every_n_steps: 2000 @@ -32,32 +32,16 @@ DistributedStrategy: Model: name: swav_resnet50_pretrain apex: False - queue_length: 3804 # 0 + queue_length: 0 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 - freeze_prototypes_niters: 5005 # 313 + freeze_prototypes_niters: 313 normalize: True hidden_mlp: 2048 output_dim: 128 nmb_prototypes: 3000 -# Optimizer: -# name: MomentumLARC -# momentum: 0.9 -# weight_decay: 1e-6 -# trust_coefficient: 0.001 -# clip: False -# tensor_fusion: False -# decay_unit: step -# LRScheduler: -# name: TimmCosine -# learning_rate: 4.8 -# eta_min: 0.0048 -# warmup_epoch: 10 -# warmup_start_lr: 0.3 -# warmup_prefix: True - Optimizer: name: MomentumLARC momentum: 0.9 @@ -68,27 +52,26 @@ Optimizer: decay_unit: step LRScheduler: name: TimmCosine - learning_rate: 0.6 - eta_min: 0.0006 - warmup_epoch: 0 - warmup_start_lr: 0. 
+ learning_rate: 4.8 + eta_min: 0.0048 + warmup_epoch: 10 + warmup_start_lr: 0.3 warmup_prefix: True - last_epoch: 0 # data loader for train and eval DataLoader: Train: dataset: name: MultiCropDataset - root: ./data/ILSVRC2012/train + root: ./data/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] max_scale_crops: [1, 0.14] sampler: name: DistributedBatchSampler - batch_size: 64 # 4card # 128 32 card # accum_steps: 1, total batchsize: 4096 - drop_last: False + batch_size: 64 # 4 card # 128 32 card # accum_steps: 1, total batchsize: 4096 + drop_last: True shuffle: True loader: num_workers: 10 diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index 6972eddd..d30ff34b 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -16,12 +16,11 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export CUDA_VISIBLE_DEVICES=4,5,6,7 -# export CUDA_VISIBLE_DEVICES=7 +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml + # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file From 5922ab1f41d62dec533d8a1dc910ed6880a39745 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 4 May 2023 21:14:41 +0800 Subject: [PATCH 15/46] format --- passl/data/preprocess/basic_transforms.py | 19 --- passl/engine/loops/classification_loop.py | 117 +----------------- .../engine/loops/contrastive_learning_loop.py | 3 +- passl/models/swav.py | 29 +---- ..._resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml} | 0 ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 19 --- tasks/ssl/swav/finetune.sh | 2 +- 7 files changed, 6 insertions(+), 183 deletions(-) rename tasks/ssl/swav/configs/{swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml => swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml} (100%) diff --git a/passl/data/preprocess/basic_transforms.py b/passl/data/preprocess/basic_transforms.py index ace7c1d2..7be2b26a 100644 --- a/passl/data/preprocess/basic_transforms.py +++ b/passl/data/preprocess/basic_transforms.py @@ -57,7 +57,6 @@ "SimCLRGaussianBlur", "BYOLSolarize", "MAERandCropImage", - # "GaussianBlur" ] @@ -942,21 +941,3 @@ def __call__(self, img): else: img = ImageOps.solarize(img) return img - - -# class GaussianBlur(object): -# """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" -# def __init__(self, sigma=[.1, 2.], _PIL=False): -# self.sigma = sigma -# self.kernel_size = 23 -# self._PIL = _PIL - -# def __call__(self, x): -# sigma = np.random.uniform(self.sigma[0], self.sigma[1]) -# if self._PIL: -# x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) -# return x -# else: -# x = cv2.GaussianBlur(np.array(x), -# (self.kernel_size, self.kernel_size), sigma) -# return Image.fromarray(x.astype(np.uint8)) \ No newline at end of file diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 4c7349a2..08bdc1d0 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -32,85 +32,6 @@ from .loop import _Loop, TrainingEpochLoop -import os 
-import logging -import time -from datetime import timedelta -import pandas as pd - - -class LogFormatter: - def __init__(self): - self.start_time = time.time() - - def format(self, record): - elapsed_seconds = round(record.created - self.start_time) - - prefix = "%s - %s - %s" % ( - record.levelname, - time.strftime("%x %X"), - timedelta(seconds=elapsed_seconds), - ) - message = record.getMessage() - message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) - return "%s - %s" % (prefix, message) if message else "" - - -def create_logger(filepath, rank): - """ - Create a logger. - Use a different log file for each process. - """ - # create log formatter - log_formatter = LogFormatter() - - # create file handler and set level to debug - if filepath is not None: - if rank > 0: - filepath = "%s-%i" % (filepath, rank) - file_handler = logging.FileHandler(filepath, "a") - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(log_formatter) - - # create console handler and set level to info - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(log_formatter) - - # create logger and set level to debug - logger = logging.getLogger() - logger.handlers = [] - logger.setLevel(logging.DEBUG) - logger.propagate = False - if filepath is not None: - logger.addHandler(file_handler) - logger.addHandler(console_handler) - - # reset logger elapsed time - def reset_time(): - log_formatter.start_time = time.time() - - logger.reset_time = reset_time - - return logger - - -def init_logger(name): - logger = create_logger( - os.path.join("{}.log".format(name)), rank=0 - ) - logger.info("============ Initialized logger ============") - logger.info("") - return logger - - -def log_model(model, logger): - model1 = model.res_model - for name, param in model1.named_parameters(): - logger.info(name) - logger.info(param.abs().mean()) - - class ClassificationTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): @@ -131,13 +52,6 @@ def forward_backward(self, batch): data = batch[0][idx * step_size:(idx + 1) * step_size] label = batch[1][idx * step_size:(idx + 1) * step_size] - ####### test ####### - # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') - # import numpy as np - # np.random.seed(42) - # a = np.random.rand(32, 3, 224, 224) - # data = paddle.to_tensor(a).astype('float32') - # do cast if using fp16 otherwise do nothing with paddle.amp.auto_cast( enable=self.trainer.fp16, @@ -149,11 +63,6 @@ def forward_backward(self, batch): final_out.append(out) loss_dict = self.trainer.train_loss_func(out, label) - # import pdb; pdb.set_trace() - - ####### test ####### - # logger1 = init_logger('before') - # log_model(self.trainer.model, logger1) for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -164,20 +73,6 @@ def forward_backward(self, batch): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - - ####### test ####### -# grad_sync(self.trainer.optimizer.param_groups) - -# # do unscale and step if using fp16 and not found nan/inf -# # otherwise do nothing -# self.trainer.scaler.step(self.trainer.optimizer) -# # do update loss scaling if using fp16 -# # otherwise do nothing -# self.trainer.scaler.update() - - # logger2 = init_logger('after') - # 
log_model(self.trainer.model, logger2) - out = paddle.concat(final_out, axis=0) return out, final_loss_dict @@ -198,7 +93,7 @@ def train_one_step(self, batch, total_iterations=None): # clear gradients self.trainer.optimizer.clear_grad() - if self.trainer.lr_decay_unit == 'step': # default is step + if self.trainer.lr_decay_unit == 'step': self.trainer.optimizer.lr_step(self.global_step) return out, loss_dict @@ -281,19 +176,9 @@ def eval_one_dataset(self, eval_dataloader): custom_black_list=self.trainer.fp16_custom_black_list, level=self.trainer.fp16_level): - ####### test ####### - # label = paddle.to_tensor([133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960, 133, 141, 371, 254, 89, 244, 33, 64, 542, 93, 262, 674, 898, 796, 785, 727, 228, 792, 853, 639, 410, 357, 545, 473, 637, 400, 863, 386, 689, 359, 476, 960]).cast('int32') - # import numpy as np - # np.random.seed(42) - # a = np.random.rand(32, 3, 224, 224) - # data = paddle.to_tensor(a).astype('float32') - - # import pdb; pdb.set_trace() - # out = self.trainer.model(data) out = self.trainer.model(batch[0]) # calc loss if self.trainer.eval_loss_func is not None: - # loss_dict = self.trainer.eval_loss_func(out, target) loss_dict = self.trainer.eval_loss_func(out, batch[1]) for key in loss_dict: if key not in output_info: diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 4ebf1346..d943d4cc 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -181,14 +181,13 @@ def train_one_step(self, batch, total_iterations): # do unscale and step if using fp16 and not found nan/inf # otherwise do nothing - self.trainer.scaler.step(self.trainer.optimizer) + self.trainer.scaler.step(self.trainer.optimizer) # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - if self.trainer.lr_decay_unit == 'step': self.trainer.optimizer.lr_step(self.global_step) diff --git a/passl/models/swav.py b/passl/models/swav.py index 311b7b19..cf9500fb 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,5 +1,4 @@ import os -import copy import numpy as np from sys import flags from collections import defaultdict @@ -48,6 +47,9 @@ def _load_model(self, path, model, tag): .format(k, para_state_dict[k].shape, model_state_dict[k] .shape)) else: + # conpact FP16 saving pretrained weight + if model_state_dict[k].dtype != para_state_dict[k].dtype: + para_state_dict[k] = para_state_dict[k].astype(model_state_dict[k].dtype) model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 model.set_dict(model_state_dict) @@ -58,29 +60,6 @@ def _load_model(self, path, model, tag): def load_pretrained(self, path, rank=0, finetune=False): pass -# if not os.path.exists(path + '.pdparams'): -# raise ValueError("Model pretrain path {} does not " -# "exists.".format(path)) - -# state_dict = self.state_dict() -# param_state_dict = paddle.load(path + ".pdparams") - -# # for FP16 saving pretrained weight -# for key, value in param_state_dict.items(): -# if key in param_state_dict and key in state_dict and param_state_dict[ -# key].dtype != state_dict[key].dtype: -# param_state_dict[key] = param_state_dict[key].astype( -# state_dict[key].dtype) - -# if not finetune: -# self.set_dict(param_state_dict) -# else: # load model when finetune -# for k in 
['head0.weight', 'head0.bias', 'head.weight', 'head.bias']: -# if k in param_state_dict: -# logger.info(f"Removing key {k} from pretrained checkpoint") -# del param_state_dict[k] - -# self.set_dict(param_state_dict) def save(self, path, local_rank=0, rank=0): paddle.save(self.state_dict(), path + ".pdparams") @@ -109,7 +88,6 @@ def __init__(self, class_num=1000, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - # self._load_model("linear.pdparams", self.linear, 'linear') def forward(self, inp): with paddle.no_grad(): @@ -125,7 +103,6 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - # self._load_model("projection_head.pdparams", self.res_model.projection_head, 'projection_head') def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): """ diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml similarity index 100% rename from tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml rename to tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index ba38de7e..7cca7774 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -21,9 +21,6 @@ Global: # FP16 setting FP16: level: O1 -# GradScaler: -# init_loss_scaling: 65536.0 -# incr_every_n_steps: 2000 DistributedStrategy: data_parallel: True @@ -42,22 +39,6 @@ Model: output_dim: 128 nmb_prototypes: 3000 -# Optimizer: -# name: MomentumLARC -# momentum: 0.9 -# weight_decay: 1e-6 -# trust_coefficient: 0.001 -# clip: False -# tensor_fusion: False -# decay_unit: step -# LRScheduler: -# name: TimmCosine -# learning_rate: 4.8 -# eta_min: 0.0048 -# warmup_epoch: 10 -# warmup_start_lr: 0.3 -# warmup_prefix: True - Optimizer: name: MomentumLARC momentum: 0.9 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index c577ddb1..494e2002 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -23,5 +23,5 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp16o1.yaml + passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml \ No newline at end of file From 8270a3cecc6c18b8b3f9e1fa138afe8bf81f1097 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 11:02:26 +0800 Subject: [PATCH 16/46] fix_AttrDict_error --- passl/utils/io.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/passl/utils/io.py b/passl/utils/io.py index deec5fef..8904215c 100644 --- a/passl/utils/io.py +++ b/passl/utils/io.py @@ -157,6 +157,12 @@ def save_checkpoint(net, if local_rank == 0: if loss_scaler is not None: opt_state_dict['scaler_state'] = loss_scaler.state_dict() + + # Solve AttrDict can't pickle error + for group in opt_state_dict['param_groups']: + if 'LRScheduler' in group: + group['LRScheduler'] = dict(group['LRScheduler']) + for model_prefix in model_prefixs: paddle.save(opt_state_dict, model_prefix + ".pdopt") paddle.save(metric_info, model_prefix + ".pdstates") 
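Note on the io.py hunk above: each optimizer param group can carry an AttrDict under its 'LRScheduler' key, and paddle.save cannot pickle it, so the entry is downcast to a plain dict before the state is saved. The same idea as a standalone helper (the helper name is hypothetical; the loop mirrors the added lines):

    # Hypothetical helper equivalent to the in-place loop added in save_checkpoint.
    def make_opt_state_picklable(opt_state_dict):
        for group in opt_state_dict.get('param_groups', []):
            if 'LRScheduler' in group:
                # AttrDict is a dict subclass used for configs; a plain dict pickles fine
                group['LRScheduler'] = dict(group['LRScheduler'])
        return opt_state_dict

    # usage inside save_checkpoint, just before paddle.save(opt_state_dict, ...):
    # opt_state_dict = make_opt_state_picklable(opt_state_dict)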
From 4e42f8e6277d2161a04458e4b5f47fb7b9a977d8 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 14:41:27 +0800 Subject: [PATCH 17/46] replace_swav_resnet --- passl/models/__init__.py | 3 +- passl/models/resnet.py | 458 ++++++++---------- passl/models/swav.py | 6 +- passl/models/swav_resnet.py | 111 +++++ passl/optimizer/momentum_larc.py | 2 - ...v_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml | 2 +- tasks/ssl/swav/finetune.sh | 2 +- 7 files changed, 317 insertions(+), 267 deletions(-) create mode 100644 passl/models/swav_resnet.py diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 38ea440d..85f9663b 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,8 @@ from .convnext import * from .mocov3 import * from .swav import * -# from .simsiam import * +from .swav_resnet import * +from .simsiam import * __all__ = ["build_model"] diff --git a/passl/models/resnet.py b/passl/models/resnet.py index 735d6485..f15f3443 100644 --- a/passl/models/resnet.py +++ b/passl/models/resnet.py @@ -1,274 +1,214 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os import paddle -import functools -import paddle.nn as nn +from paddle.vision.models.resnet import ResNet as PDResNet +from paddle.vision.models.resnet import BottleneckBlock, BasicBlock from passl.models.base_model import Model -# from base_model import Model - -def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): - """3x3 convolution with padding""" - return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, - kernel_size=3, stride=stride, padding=dilation, groups=groups, - dilation=dilation, bias_attr=False, ) - - -def conv1x1(in_planes, out_planes, stride=1): - """1x1 convolution""" - return paddle.nn.Conv2D(in_channels=in_planes, out_channels=out_planes, - kernel_size=1, stride=stride, bias_attr=False) - - -class BasicBlock(nn.Layer): - expansion = 1 - __constants__ = ['downsample'] - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups= - 1, base_width=64, dilation=1, norm_layer=None): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = paddle.nn.BatchNorm2D - if groups != 1 or base_width != 64: - raise ValueError( - 'BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError( - 'Dilation > 1 not supported in BasicBlock') - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = paddle.nn.ReLU() - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample is not None: - identity = self.downsample(x) - out += identity - out = self.relu(out) - return out - - -class Bottleneck(paddle.nn.Layer): - expansion = 4 - __constants__ = ['downsample'] - - def __init__(self, inplanes, planes, stride=1, downsample=None, groups= - 1, base_width=64, dilation=1, norm_layer=None): - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = paddle.nn.BatchNorm2D - width = int(planes * (base_width / 64.0)) * groups - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = paddle.nn.ReLU() - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - out = self.conv3(out) - out = self.bn3(out) - if self.downsample is not None: - identity = self.downsample(x) - out += identity - out = self.relu(out) - return out - -def kaiming_normal_init(param, **kwargs): - initializer = nn.initializer.KaimingNormal(**kwargs) - initializer(param, param.block) - -def constant_init(param, **kwargs): - initializer = nn.initializer.Constant(**kwargs) - initializer(param, param.block) - - -class ResNet(paddle.nn.Layer): - def __init__(self, block, layers, zero_init_residual=False, groups=1, - widen=1, width_per_group=64, replace_stride_with_dilation=None, - norm_layer=None, normalize=False, output_dim=0, hidden_mlp=0, - nmb_prototypes=0, eval_mode=False): - - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = functools.partial(paddle.nn.BatchNorm2D, use_global_stats=False) - self._norm_layer = norm_layer - 
self.eval_mode = eval_mode - self.padding = paddle.nn.Pad2D(padding=1, value=0.0) - self.inplanes = width_per_group * widen - self.dilation = 1 - if replace_stride_with_dilation is None: - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError( - 'replace_stride_with_dilation should be None or a 3-element tuple, got {}' - .format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - num_out_filters = width_per_group * widen - self.conv1 = paddle.nn.Conv2D(in_channels=3, out_channels= - num_out_filters, kernel_size=7, stride=2, padding=2, bias_attr= - False) - self.bn1 = norm_layer(num_out_filters) - self.relu = paddle.nn.ReLU() - self.maxpool = paddle.nn.MaxPool2D(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, num_out_filters, layers[0]) - num_out_filters *= 2 - self.layer2 = self._make_layer(block, num_out_filters, layers[1], - stride=2, dilate=replace_stride_with_dilation[0]) - num_out_filters *= 2 - self.layer3 = self._make_layer(block, num_out_filters, layers[2], - stride=2, dilate=replace_stride_with_dilation[1]) - num_out_filters *= 2 - self.layer4 = self._make_layer(block, num_out_filters, layers[3], - stride=2, dilate=replace_stride_with_dilation[2]) - self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - self.l2norm = normalize - if output_dim == 0: - self.projection_head = None - elif hidden_mlp == 0: - self.projection_head = paddle.nn.Linear(in_features= - num_out_filters * block.expansion, out_features=output_dim) - else: - self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( - in_features=num_out_filters * block.expansion, out_features - =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, - momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, - bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), - paddle.nn.Linear(in_features=hidden_mlp, out_features= - output_dim)) - self.prototypes = None - if isinstance(nmb_prototypes, list): - self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) - elif nmb_prototypes > 0: - self.prototypes = paddle.nn.Linear(in_features=output_dim, - out_features=nmb_prototypes, bias_attr=False) - for sublayer in self.sublayers(): - if isinstance(sublayer, nn.Conv2D): - kaiming_normal_init(sublayer.weight) # todo mode='fan_out', - elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): - constant_init(sublayer.weight, value=1.0) - constant_init(sublayer.bias, value=0.0) - +from passl.nn import init + +__all__ = [ + "ResNet", + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", + "resnext50_32x4d", + "resnext50_64x4d", + "resnext101_32x4d", + "resnext101_64x4d", + "resnext152_32x4d", + "resnext152_64x4d", + "wide_resnet50_2", + "wide_resnet101_2", +] + +class ResNet(PDResNet, Model): + def __init__( + self, + block, + depth=50, + width=64, + class_num=1000, + with_pool=True, + groups=1, + zero_init_residual=True, + ): + super().__init__(block, depth=depth, width=width, num_classes=class_num, with_pool=with_pool, groups=groups) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 if zero_init_residual: - for sublayer in self.sublayers(): - if isinstance(m, Bottleneck): - param_init.constant_init(sublayer.bn3.weight, value=0.0) + for m in self.sublayers(): + if isinstance(m, BottleneckBlock): + init.constant_(m.bn3.weight, 0) elif isinstance(m, BasicBlock): - param_init.constant_init(sublayer.bn2.weight, value=0.0) - - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = paddle.nn.Sequential(conv1x1(self.inplanes, planes * - block.expansion, stride), norm_layer(planes * block.expansion)) - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample, self - .groups, self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - return paddle.nn.Sequential(*layers) - - def forward_backbone(self, x): - x = self.padding(x) - x = self.conv1(x) - # print("before bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) - x = self.bn1(x) - # print("bn mean var", self.bn1._mean.mean(), self.bn1._variance.mean()) - x = self.relu(x) - x = self.maxpool(x) - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - if self.eval_mode: - return x - x = self.avgpool(x) - x = paddle.flatten(x=x, start_axis=1) - return x - - def forward_head(self, x): - if self.projection_head is not None: - # print("before proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) - x = self.projection_head(x) - # print(" proj bn mean var", self.projection_head[1]._mean.mean(), self.projection_head[1]._variance.mean()) - if self.l2norm: - x = paddle.nn.functional.normalize(x=x, axis=1, p=2) - if self.prototypes is not None: - return x, self.prototypes(x) - return x - - def forward(self, inputs): - if not isinstance(inputs, list): - inputs = [inputs] - - idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
- to_tensor(data=[inp.shape[-1] for inp in inputs]), - return_counts=True)[1], axis=0) # padiff - start_idx = 0 - for end_idx in idx_crops: - _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) - if start_idx == 0: - output = _out - else: - output = paddle.concat(x=(output, _out)) - start_idx = end_idx - return self.forward_head(output) - - -class MultiPrototypes(paddle.nn.Layer): - def __init__(self, output_dim, nmb_prototypes): - super(MultiPrototypes, self).__init__() - self.nmb_heads = len(nmb_prototypes) - for i, k in enumerate(nmb_prototypes): - self.add_module('prototypes' + str(i), paddle.nn.Linear( - in_features=output_dim, out_features=k, bias_attr=False)) - - def forward(self, x): - out = [] - for i in range(self.nmb_heads): - out.append(getattr(self, 'prototypes' + str(i))(x)) - return out + init.constant_(m.bn2.weight, 0) + def load_pretrained(self, path, rank=0, finetune=False): + if not os.path.exists(path + '.pdparams'): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) -def resnet50(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + state_dict = self.state_dict() + param_state_dict = paddle.load(path + ".pdparams") + # for FP16 saving pretrained weight + for key, value in param_state_dict.items(): + if key in param_state_dict and key in state_dict and param_state_dict[ + key].dtype != state_dict[key].dtype: + param_state_dict[key] = param_state_dict[key].astype( + state_dict[key].dtype) -def resnet50w2(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], widen=2, **kwargs) + self.set_dict(param_state_dict) + def save(self, path, local_rank=0, rank=0): + paddle.save(self.state_dict(), path + ".pdparams") -def resnet50w4(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], widen=4, **kwargs) +def resnet18(**kwargs): + """ResNet 18-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + model = ResNet(BasicBlock, 18, **kwargs) + return model -def resnet50w5(**kwargs): - return ResNet(Bottleneck, [3, 4, 6, 3], widen=5, **kwargs) +def resnet34(**kwargs): + """ResNet 34-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + model = ResNet(BasicBlock, 34, **kwargs) + return model + +def resnet50(**kwargs): + """ResNet 50-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + + +def resnet101(**kwargs): + """ResNet 101-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + + model = ResNet(BottleneckBlock, 101, **kwargs) + return model + +def resnet152(**kwargs): + """ResNet 152-layer model from + `"Deep Residual Learning for Image Recognition" `_. + """ + + model = ResNet(BottleneckBlock, 152, **kwargs) + return model + + +def resnext50_32x4d(**kwargs): + """ResNeXt-50 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 32 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + +def resnext50_64x4d(**kwargs): + """ResNeXt-50 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 64 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + +def resnext101_32x4d(**kwargs): + """ResNeXt-101 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. 
+ """ + + kwargs['groups'] = 32 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 101, **kwargs) + return model + +def resnext101_64x4d(**kwargs): + """ResNeXt-101 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 64 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 101, **kwargs) + return model + + +def resnext152_32x4d(**kwargs): + """ResNeXt-152 32x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 32 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 152, **kwargs) + return model + +def resnext152_64x4d(**kwargs): + """ResNeXt-152 64x4d model from + `"Aggregated Residual Transformations for Deep Neural Networks" `_. + """ + + kwargs['groups'] = 64 + kwargs['width'] = 4 + model = ResNet(BottleneckBlock, 152, **kwargs) + return model + +def wide_resnet50_2(**kwargs): + """Wide ResNet-50-2 model from + `"Wide Residual Networks" `_. + """ + + kwargs['width'] = 64 * 2 + model = ResNet(BottleneckBlock, 50, **kwargs) + return model + +def wide_resnet101_2(**kwargs): + """Wide ResNet-101-2 model from + `"Wide Residual Networks" `_. + """ + + kwargs['width'] = 64 * 2 + model = ResNet(BottleneckBlock, 101, **kwargs) + return model diff --git a/passl/models/swav.py b/passl/models/swav.py index cf9500fb..0b0b554c 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -7,9 +7,9 @@ import paddle.nn as nn from passl.nn import init -from passl.scheduler import build_lr_scheduler, lr_scheduler +from passl.scheduler import build_lr_scheduler from passl.utils import logger -from passl.models.resnet import resnet50 +from passl.models.swav_resnet import swavresnet50 from passl.models.base_model import Model @@ -27,7 +27,7 @@ class SwAV(Model): def __init__(self, **kwargs): super().__init__() - self.res_model = resnet50(**kwargs) + self.res_model = swavresnet50(**kwargs) def _load_model(self, path, model, tag): if os.path.isfile(path): diff --git a/passl/models/swav_resnet.py b/passl/models/swav_resnet.py new file mode 100644 index 00000000..2869eedc --- /dev/null +++ b/passl/models/swav_resnet.py @@ -0,0 +1,111 @@ +import paddle +import functools +import paddle.nn as nn + +from .resnet import ResNet, BottleneckBlock + +def kaiming_normal_init(param, **kwargs): + initializer = nn.initializer.KaimingNormal(**kwargs) + initializer(param, param.block) + +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + + +class SwAVResNet(paddle.nn.Layer): + def __init__(self, block, depth, + normalize=False, output_dim=0, hidden_mlp=0, + nmb_prototypes=0, eval_mode=False): + + super(SwAVResNet, self).__init__() + self.l2norm = normalize + self.eval_mode = eval_mode + num_out_filters = 512 + + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + if output_dim == 0: + self.projection_head = None + elif hidden_mlp == 0: + self.projection_head = paddle.nn.Linear(in_features= + num_out_filters * block.expansion, out_features=output_dim) + else: + self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( + in_features=num_out_filters * block.expansion, out_features + =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, + momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, + bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), + paddle.nn.Linear(in_features=hidden_mlp, out_features= + output_dim)) + + self.prototypes = None + if isinstance(nmb_prototypes, list): + self.prototypes 
= MultiPrototypes(output_dim, nmb_prototypes) + elif nmb_prototypes > 0: + self.prototypes = paddle.nn.Linear(in_features=output_dim, + out_features=nmb_prototypes, bias_attr=False) + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) # todo mode='fan_out', + elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): + constant_init(sublayer.weight, value=1.0) + constant_init(sublayer.bias, value=0.0) + + self.encoder = functools.partial(ResNet, block=block, depth=depth)(with_pool=False, class_num=0) + + def forward_backbone(self, x): + x = self.encoder(x) + + if self.eval_mode: + return x + + x = self.avgpool(x) + x = paddle.flatten(x=x, start_axis=1) + return x + + def forward_head(self, x): + if self.projection_head is not None: + x = self.projection_head(x) + if self.l2norm: + x = paddle.nn.functional.normalize(x=x, axis=1, p=2) + if self.prototypes is not None: + return x, self.prototypes(x) + return x + + def forward(self, inputs): + if not isinstance(inputs, list): + inputs = [inputs] + + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. + to_tensor(data=[inp.shape[-1] for inp in inputs]), + return_counts=True)[1], axis=0) # padiff + start_idx = 0 + for end_idx in idx_crops: + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) + if start_idx == 0: + output = _out + else: + output = paddle.concat(x=(output, _out)) + start_idx = end_idx + return self.forward_head(output) + + +class MultiPrototypes(paddle.nn.Layer): + def __init__(self, output_dim, nmb_prototypes): + super(MultiPrototypes, self).__init__() + self.nmb_heads = len(nmb_prototypes) + for i, k in enumerate(nmb_prototypes): + self.add_module('prototypes' + str(i), paddle.nn.Linear( + in_features=output_dim, out_features=k, bias_attr=False)) + + def forward(self, x): + out = [] + for i in range(self.nmb_heads): + out.append(getattr(self, 'prototypes' + str(i))(x)) + return out + + +def swavresnet50(**kwargs): + return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) + diff --git a/passl/optimizer/momentum_larc.py b/passl/optimizer/momentum_larc.py index 80427b9d..09982f78 100644 --- a/passl/optimizer/momentum_larc.py +++ b/passl/optimizer/momentum_larc.py @@ -16,10 +16,8 @@ from __future__ import division from __future__ import print_function -import math import paddle from .optimizer import Optimizer -from passl.utils import logger class MomentumLARC(Optimizer): diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml index 974d84b1..a01f5680 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams + pretrained_model: swav_800ep_pretrain_adjustresnet.pdparams finetune: True output_dir: ./output/semi_0426_semi10 device: gpu diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 494e2002..af548129 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -23,5 +23,5 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - passl-train -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml 
+ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml \ No newline at end of file From 21639c2e5df86e7a2077ac1a5fcc1e9f2b953b24 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 15:41:32 +0800 Subject: [PATCH 18/46] add_ci_readme --- passl/models/swav.py | 1 + tasks/ssl/swav/README.md | 50 ++++++++----------- ...av_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml} | 2 +- ...wav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml} | 2 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 2 +- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 4 +- tasks/ssl/swav/finetune.sh | 2 +- tasks/ssl/swav/linearprobe.sh | 2 +- tests/CI/case.sh | 49 ++++++++++++++++++ .../swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh | 30 +++++++++++ .../swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh | 30 +++++++++++ ...wav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh | 29 +++++++++++ 12 files changed, 168 insertions(+), 35 deletions(-) rename tasks/ssl/swav/configs/{swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml => swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml} (97%) rename tasks/ssl/swav/configs/{swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml} (95%) create mode 100644 tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh create mode 100644 tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh create mode 100644 tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh diff --git a/passl/models/swav.py b/passl/models/swav.py index 0b0b554c..22905d9c 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -30,6 +30,7 @@ def __init__(self, **kwargs): self.res_model = swavresnet50(**kwargs) def _load_model(self, path, model, tag): + path = path + ".pdparams" if os.path.isfile(path): para_state_dict = paddle.load(path) diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index b3f14b0e..d14c1b81 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -1,10 +1,10 @@ -## MoCo v3 for Self-supervised ResNet and ViT +## SwAV: Unsupervised Learning of Visual Features by Contrasting Cluster Assignments -PaddlePaddle reimplementation of [facebookresearch's repository for the MoCo v3 model](https://github.com/facebookresearch/moco-v3) that was released with the paper [An Empirical Study of Training Self-Supervised Vision Transformers](https://arxiv.org/abs/2104.02057). +PaddlePaddle reimplementation of [facebookresearch's repository for the SwAV model](https://github.com/facebookresearch/swav) that was released with the paper [Unsupervised Learning of Visual Features by Contrasting Cluster Assignments](https://arxiv.org/abs/2006.09882). ## Requirements -To enjoy some new features, PaddlePaddle 2.4 is required. For more installation tutorials +To enjoy some new features, PaddlePaddle develop is required. 
For more installation tutorials refer to [installation.md](../../../tutorials/get_started/installation.md) ## Data Preparation @@ -20,7 +20,7 @@ dataset/ ## How to Self-supervised Pre-Training -With a batch size of 4096, ViT-Base is trained with 4 nodes: +With a batch size of 4096, SwAV is trained with 4 nodes: ```bash # Note: Set the following environment variables @@ -36,12 +36,12 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml + -c ./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml ``` ## How to Linear Classification -By default, we use momentum-SGD and a batch size of 1024 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. +By default, we use momentum-SGD and a batch size of 256 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. ```bash unset PADDLE_TRAINER_ENDPOINTS @@ -55,25 +55,17 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml + -c ./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml ``` ## How to End-to-End Fine-tuning -To perform end-to-end fine-tuning for ViT, use our script to convert the pre-trained ViT checkpoint to PASSL DeiT format: - -```bash -python extract_weight.py \ - --input pretrained/checkpoint_0299.pd \ - --output pretrained/moco_vit_base.pdparams -``` - -Then run the training with the converted PASSL format checkpoint: +To perform end-to-end fine-tuning for SwAV, run the training with the trained PASSL format checkpoint: ```bash unset PADDLE_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=0,1,2,3 export FLAGS_stop_check_timeout=3600 python -m paddle.distributed.launch \ @@ -81,28 +73,30 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ./configs/mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1.yaml + -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml ``` ## Other Configurations -We provide more directly runnable configurations, see [MoCoV3 Configurations](./configs/). +We provide more directly runnable configurations, see [SwAV Configurations](./configs/). 
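For a quick offline sanity check of the model definition itself, the SwAV ResNet-50 wrapper added in `passl/models/swav_resnet.py` can be instantiated directly. The snippet below is only a minimal sketch: the projection and prototype sizes are illustrative values, and real training runs take their settings from the YAML configs above.

```python
import paddle
from passl.models.swav_resnet import swavresnet50

# Illustrative hyper-parameters; training uses the values from the YAML configs.
model = swavresnet50(normalize=True, output_dim=128, hidden_mlp=2048, nmb_prototypes=3000)
model.eval()

images = paddle.randn([2, 3, 224, 224])
# Returns L2-normalized projections and the prototype scores consumed by the SwAV loss.
embeddings, scores = model(images)
print(embeddings.shape, scores.shape)  # [2, 128] [2, 3000]
```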
## Models

### ResNet-50

-| Model         | Phase       | Dataset      | Configs | GPUs | Epochs | Top1 Acc | Checkpoint |
+| Model    | Phase        | Dataset      | Configs | GPUs | Epochs | Top1 Acc (%) | Links |
| ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ |
-| moco_vit_base | pretrain    | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 300 | - | [download](https://plsc.bj.bcebos.com/models/mocov3/v2.4/moco_vit_base_in1k_300ep.pd) |
-| moco_vit_base | linear prob | ImageNet2012 | [config](./configs/mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 90 | 0.7662 | |
-| moco_vit_base | finetune    | ImageNet2012 | [config](./configs/DeiT_base_patch16_224_in1k_1n8c_dp_fp16o1.yaml) | A100*N1C8 | 150 | 0.8288 | |
+| resnet50 | pretrain     | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model]() \| [log]() |
+| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml) | A100*N1C8 | - | 75.3 | [model]() \| [log]() |
+| resnet50 | finetune     | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 100 | 69.0 | [model]() \| [log]() |

## Citations

```bibtex
-@Article{chen2021mocov3,
-  author  = {Xinlei Chen* and Saining Xie* and Kaiming He},
-  title   = {An Empirical Study of Training Self-Supervised Vision Transformers},
-  journal = {arXiv preprint arXiv:2104.02057},
-  year    = {2021},
+@misc{caron2021unsupervised,
+  title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments},
+  author={Mathilde Caron and Ishan Misra and Julien Mairal and Priya Goyal and Piotr Bojanowski and Armand Joulin},
+  year={2021},
+  eprint={2006.09882},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV}
}
```
diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml
similarity index 97%
rename from tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml
rename to tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml
index a01f5680..8f641d0f 100644
--- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml
+++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml
@@ -4,7 +4,7 @@ Global:
   train_loop: ClassificationTrainingEpochLoop
   validate_loop: ClassificationEvaluationLoop
   checkpoint: null
-  pretrained_model: swav_800ep_pretrain_adjustresnet.pdparams
+  pretrained_model: swav_800ep_pretrain_adjustresnet
   finetune: True
   output_dir: ./output/semi_0426_semi10
   device: gpu
diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml
similarity index 95%
rename from tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml
rename to tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml
index c67ddd2a..5a4e9a83 100644
--- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml
+++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml
@@ -4,7 +4,7 @@ Global:
   train_loop: ClassificationTrainingEpochLoop
   validate_loop: ClassificationEvaluationLoop
   checkpoint: null
-  pretrained_model: /root/paddlejob/workspace/env_run/tangshiyu/PASSL/swav_800ep_pretrain.pdparams
+  pretrained_model:
./pretrained/swav/swav_resnet50_in1k_800ep_pretrained output_dir: ./output device: gpu save_interval: 1 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 7cca7774..9eaf4fa1 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -5,7 +5,7 @@ Global: validate_loop: None checkpoint: null pretrained_model: null - output_dir: ./output/pretrain_0504_fp16 + output_dir: ./output/ device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index d0292c58..5ccbc6ad 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -5,7 +5,7 @@ Global: validate_loop: None checkpoint: null pretrained_model: null - output_dir: ./output/pretrain_0504_fp16 + output_dir: ./output/ device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -70,7 +70,7 @@ DataLoader: max_scale_crops: [1, 0.14] sampler: name: DistributedBatchSampler - batch_size: 64 # 4 card # 128 32 card # accum_steps: 1, total batchsize: 4096 + batch_size: 128 # 64 8 card # 128 32 card # accum_steps: 1, total batchsize: 4096 drop_last: True shuffle: True loader: diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index af548129..9844d806 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -23,5 +23,5 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \ No newline at end of file diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 4c37392b..07ced970 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -22,4 +22,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 4d863ed7..60814b7c 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -38,6 +38,9 @@ function model_list(){ mocov3_vit_base_patch16_224_pt_in1k_1n8c_dp_fp16o1 mocov3_deit_base_patch16_224_ft_in1k_1n8c_dp_fp16o1 mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1 + swav_resnet50_224_ft_in1k_1n4c_dp_fp32 + swav_resnet50_224_lp_in1k_1n8c_dp_fp32 + swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 } ############ case start ############ @@ -354,6 +357,52 @@ function mocov3_vit_base_patch16_224_lp_in1k_1n8c_dp_fp16o1() { } +function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep 
'200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=2.23445 + ips_base=793.89847 + mem_base=5.67 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + +function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=4.89133 + ips_base=11111.52955 + mem_base=0.83 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + + +function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh + + loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=8.00343 + ips_base=1385.94186 + mem_base=8.63 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + function check_result() { if [ $? -ne 0 ];then echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh new file mode 100644 index 00000000..187b8e8b --- /dev/null +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh @@ -0,0 +1,30 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
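+
+# CI smoke test invoked from tests/CI/case.sh: it runs only 201 steps with deterministic
+# cuDNN settings and per-step logging, so that the loss / ips / max-memory values printed
+# around step 200 can be compared against the baselines recorded in case.sh.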
+
+export PADDLE_NNODES=1
+export PADDLE_MASTER="127.0.0.0:12538"
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \
+    -o Global.print_batch_step=1 \
+    -o Global.max_train_step=201 \
+    -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
+    -o Global.flags.FLAGS_cudnn_deterministic=1 \
+    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained
\ No newline at end of file
diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh
new file mode 100644
index 00000000..7c748f15
--- /dev/null
+++ b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh
@@ -0,0 +1,30 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export PADDLE_NNODES=1
+export PADDLE_MASTER="127.0.0.0:12538"
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+export FLAGS_stop_check_timeout=3600
+
+python -m paddle.distributed.launch \
+    --nnodes=$PADDLE_NNODES \
+    --master=$PADDLE_MASTER \
+    --devices=$CUDA_VISIBLE_DEVICES \
+    passl-train \
+    -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml \
+    -o Global.print_batch_step=1 \
+    -o Global.max_train_step=201 \
+    -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \
+    -o Global.flags.FLAGS_cudnn_deterministic=1 \
+    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained
\ No newline at end of file
diff --git a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh
new file mode 100644
index 00000000..954705ad
--- /dev/null
+++ b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh
@@ -0,0 +1,29 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.0:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export FLAGS_stop_check_timeout=3600 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml \ + -o Global.print_batch_step=1 \ + -o Global.max_train_step=201 \ + -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ + -o Global.flags.FLAGS_cudnn_deterministic=1 \ No newline at end of file From 6814c125ec8b28b2bdaf95b232d522cf6be5ebbf Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 17:02:24 +0800 Subject: [PATCH 19/46] compact_lr_group --- passl/engine/engine.py | 32 +-- passl/models/swav.py | 115 +++++---- passl/optimizer/__init__.py | 227 +++++++++++++----- passl/optimizer/momentum.py | 6 +- passl/optimizer/optimizer.py | 19 +- passl/optimizer/utils/__init__.py | 1 + passl/optimizer/utils/group_params.py | 194 +++++++++++++++ passl/scheduler/__init__.py | 8 +- passl/scheduler/lr_callable.py | 16 +- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 14 +- ...swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 4 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 4 +- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 4 +- tasks/ssl/swav/finetune.sh | 2 +- 14 files changed, 473 insertions(+), 173 deletions(-) create mode 100644 passl/optimizer/utils/__init__.py create mode 100644 passl/optimizer/utils/group_params.py diff --git a/passl/engine/engine.py b/passl/engine/engine.py index c50b5084..7cacb83f 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -213,20 +213,24 @@ def worker_init_fn(worker_id): paddle.set_default_dtype(default_dtype) # build optimizer and lr scheduler + assert self.config.get("Optimizer", None) is not None, "Optimizer must be defined in config." + self.lr_decay_unit = self.config["Optimizer"].pop('lr_decay_unit', None) + if self.lr_decay_unit is None: + self.lr_decay_unit = 'step' + logger.warning('lr_decay_unit is not set in optimizer config, set to step by default!') if self.mode == 'train': - assert self.config.get("Optimizer", None) is not None, "Optimizer must be defined in config." 
- if self.config["Optimizer"].get('decay_unit', None) is not None: - self.lr_decay_unit = self.config["Optimizer"]['decay_unit'] - else: - self.lr_decay_unit = 'step' - Warning('lr_decay_unit is not set in optimizer config, set to step by default') - - config_lr_scheduler = self.config["Optimizer"].get('LRScheduler', None) - self.lr_scheduler = None - if config_lr_scheduler is not None: - self.lr_scheduler = build_lr_scheduler(config_lr_scheduler, self.config["Global"]["epochs"], len(self.train_dataloader), self.lr_decay_unit) - - self.optimizer = build_optimizer(self.config["Optimizer"], self.model, self.config, len(self.train_dataloader), self.lr_scheduler) + config_lr_scheduler = self.config.get('LRScheduler', None) + self.lr_scheduler = None + if config_lr_scheduler is not None: + self.lr_decay_unit = config_lr_scheduler.get('decay_unit', + 'step') + self.lr_scheduler = build_lr_scheduler( + config_lr_scheduler, self.config["Global"]["epochs"], + len(self.train_dataloader)) + + self.optimizer = build_optimizer(self.config["Optimizer"], self.lr_scheduler, self.model, + self.config["Global"]["epochs"], len(self.train_dataloader), + self.lr_decay_unit) # load pretrained model if self.config["Global"]["pretrained_model"] is not None: @@ -368,4 +372,4 @@ def export(self): self.model.eval() path = os.path.join(self.output_dir, self.config["Model"]["name"]) - io.export(self.config["Export"], self.model, path) + io.export(self.config["Export"], self.model, path) \ No newline at end of file diff --git a/passl/models/swav.py b/passl/models/swav.py index 22905d9c..8d509560 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -105,68 +105,67 @@ def __init__(self, **kwargs): def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') - def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): - """ - custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] - """ - - self.custom_cfg = config.pop('custom_cfg', None) - if self.custom_cfg is not None: - assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." + # def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): + # """ + # custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] + # """ + + # self.custom_cfg = config.pop('custom_cfg', None) + # if self.custom_cfg is not None: + # assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
- for item in self.custom_cfg: - assert isinstance( - item, dict), "The item of `custom_cfg` must be a dict" + # for item in self.custom_cfg: + # assert isinstance( + # item, dict), "The item of `custom_cfg` must be a dict" - param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) + # param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) - return param_group + # return param_group - def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): - # Collect different parameter groups - if self.custom_cfg is None or len(self.custom_cfg) == 0: - return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] - - # split params - self.weight_decay = config['weight_decay'] - params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault - params_dict['PasslDefault'] = [] - for name, param in model.named_parameters(): - if param.stop_gradient: - continue - for idx, item in enumerate(self.custom_cfg): - if item['name'] in name: - params_dict[item['name']].append(param) - break - else: - params_dict['PasslDefault'].append(param) - - res = [] - for item in self.custom_cfg: - weight_decay_mult = item.get("weight_decay_mult", None) - if item.get("LRScheduler", None) is not None: - lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) - else: - Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) - # todo: initialize LRCallable here. - param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} - - if self.weight_decay is not None and weight_decay_mult is not None: - param_dict['weight_decay'] = self.weight_decay * weight_decay_mult - param_dict['tensor_fusion'] = tensor_fusion - res.append(param_dict) - else: - res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) - - msg = 'Parameter groups for optimizer: \n' - for idx, item in enumerate(self.custom_cfg): - params_name = [p.name for p in params_dict[item['name']]] - item = item.copy() - item['params_name'] = params_name - msg += 'Group {}: \n{} \n'.format(idx, item) - logger.info(msg) - - return res + # def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): + # # Collect different parameter groups + # if self.custom_cfg is None or len(self.custom_cfg) == 0: + # return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] + + # # split params + # self.weight_decay = config['weight_decay'] + # params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault + # params_dict['PasslDefault'] = [] + # for name, param in model.named_parameters(): + # if param.stop_gradient: + # continue + # for idx, item in enumerate(self.custom_cfg): + # if item['name'] in name: + # params_dict[item['name']].append(param) + # break + # else: + # params_dict['PasslDefault'].append(param) + + # res = [] + # for item in self.custom_cfg: + # weight_decay_mult = item.get("weight_decay_mult", None) + # if item.get("LRScheduler", None) is not None: + # lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) + # else: + # Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) + # param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} + + # if self.weight_decay is not None and weight_decay_mult is not None: + # 
param_dict['weight_decay'] = self.weight_decay * weight_decay_mult + # param_dict['tensor_fusion'] = tensor_fusion + # res.append(param_dict) + # else: + # res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) + + # msg = 'Parameter groups for optimizer: \n' + # for idx, item in enumerate(self.custom_cfg): + # params_name = [p.name for p in params_dict[item['name']]] + # item = item.copy() + # item['params_name'] = params_name + # msg += 'Group {}: \n{} \n'.format(idx, item) + # logger.info(msg) + + # return res def forward(self, inp): return self.res_model(inp) diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 43216690..9f2170ae 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -18,12 +18,12 @@ from collections import defaultdict import copy +import re import paddle from passl.core.grad_clip import ClipGradByGlobalNorm from passl.core.param_fuse import get_fused_params -from passl.scheduler import LRCallable - +from passl.scheduler import build_lr_scheduler, LRCallable from passl.utils import logger from .optimizer import Optimizer @@ -32,76 +32,183 @@ from .momentum import Momentum from .momentum_lars import MomentumLARS from .momentum_larc import MomentumLARC - - -def build_optimizer(optim_config, model, config, trainset_length, lr_scheduler): - optim_config = copy.deepcopy(optim_config) - optim_name = optim_config.pop('name') - +from .utils.group_params import ( + param_group_layer_decay, + param_group_weight_decay, + group_params_by_state) + + +def build_group_lr_scheduler(param_groups_cfg, epochs, step_each_epoch, lr_decay_unit): + ''' + Build lr scheduler in each param_group. + Args: + param_groups_cfg: Dict, param_groups config + epochs: Int, epochs + step_each_epoch: Int, step for each epoch + + Returns: + param_groups_cfg: Dict of param_groups config in which lr has beed build + ''' + for idx, item in enumerate(param_groups_cfg): + lr_cfg = item.get('lr', None) + if isinstance(lr_cfg, dict): + if 'decay_unit' in lr_cfg: + logger.warning('decay_unit is no need to set, for it will be reset by lr_decay_unit.') + lr_cfg['decay_unit'] = lr_decay_unit + lr_scheduler = build_lr_scheduler(lr_cfg, epochs, step_each_epoch) + if isinstance(lr_scheduler, LRCallable): + item['lr_func'] = lr_scheduler + else: + item['lr'] = lr_scheduler + elif isinstance(lr_cfg, float): + item['lr'] = lr_cfg + logger.info('build lr scheduler in param_groups succeed.') + return param_groups_cfg + + +def group_params(model, param_groups_cfg=None): + ''' + Group params by config or by stop_gradient by default. + Args: + model: paddle.nn.Layer + param_groups_cfg: Dict, param_groups config + Returns: + Dict, f.g. 
{'group_name': {'params': [(name, param), ...],}} + ''' + + if param_groups_cfg and len(param_groups_cfg) > 0: + params_dict = {} + # init params_dict by config + for group in param_groups_cfg: + params_dict[group['name']] = {} + params_dict[group['name']]['params'] = [] + for k, v in group.items(): + params_dict[group['name']][k] = v + # add params + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + flag = 0 + for g_name in params_dict: + if 'regular_exp' in params_dict[g_name]: + regular_exp = params_dict[g_name]['regular_exp'] + group_matcher = re.compile(regular_exp) + else: + group_matcher = re.compile(g_name) + if group_matcher.match(name): + params_dict[g_name]["params"].append((name, param)) + flag = 1 + break + if flag == 0: + if 'default' not in params_dict: + params_dict['default'] = {'params': []} + params_dict['default']["params"].append((name, param)) + + logger.info(f'Model parameters has been split into {len(params_dict)} groups by config.') + for key in params_dict: + logger.info(f"{key}-params length: {len(params_dict[key]['params'])}") + + return params_dict + + # default group method + param_groups = [] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + param_groups.append((name, param)) + logger.info(f'Model parameters has been split into 1 groups by default.') + return {'default': {"params": param_groups}} + + +def build_optimizer(config, lr_scheduler, model, epochs, step_each_epoch, lr_decay_unit): + config = copy.deepcopy(config) + + optim_name = config.pop('name') + layer_decay = config.pop('layer_decay', None) grad_clip = None - grad_clip_config = optim_config.pop('grad_clip', None) + grad_clip_config = config.pop('grad_clip', None) if grad_clip_config is not None: grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval(grad_clip_name)(**grad_clip_config) - no_weight_decay_name = optim_config.pop('no_weight_decay_name', []) - tensor_fusion = optim_config.pop('tensor_fusion', True) + weight_decay = config.get('weight_decay', None) + no_weight_decay_name = config.pop('no_weight_decay_name', []) + + tensor_fusion = config.pop('tensor_fusion', True) if 'LAR' in optim_name: tensor_fusion = False - logger.info('LARS or LARC Optimizer can not use tensor fusion technology. It automatically fall back to `tensor_fusion = False`.') - - if hasattr(model, 'param_groups'): - # param_group = model.param_groups(no_weight_decay_name, tensor_fusion) # todo compact simsaim - param_group = model.param_groups(optim_config, tensor_fusion, config["Global"]["epochs"], trainset_length) - for group in param_group: - if 'tensor_fusion' in group and group['tensor_fusion']: - group['params'] = get_fused_params(group['params']) - optim_config.pop('custom_cfg', None) - + logger.info('LARS or LARC Optimizer can not use tensor fusion technology. 
' + 'It automatically fall back to `tensor_fusion = False`.') + + # param_groups is a dict like {'group_name': {'params': [(name, param), ...]}} + if hasattr(model, 'param_group_fn'): + # param groups are defined by model + model_group_cfg = config.pop('param_group_fn', {}) + param_group_map = model.param_group_fn(no_weight_decay_name=no_weight_decay_name, weight_decay=weight_decay, + layer_decay=layer_decay, **model_group_cfg) else: - param_group_map = defaultdict(list) - for n, p in model.named_parameters(): - state = copy.deepcopy(p.__dict__) - state['stop_gradient'] = p.stop_gradient - if any(nd in n for nd in no_weight_decay_name): - state['no_weight_decay'] = True - param_group_map[str(state)].append(p) - - if tensor_fusion: - # fuse params - for key in param_group_map: - if 'gpu' not in paddle.get_device(): - continue - if "'is_distributed': True" in key: - continue - if "'has_sparse_grad': True" in key: - continue - param_group_map[key] = get_fused_params(param_group_map[key]) - - # bulid optimizer params - param_group = [] + param_groups_cfg = config.get('param_groups', None) + if param_groups_cfg and len(param_groups_cfg) > 0: + param_groups_cfg = build_group_lr_scheduler(param_groups_cfg, epochs, step_each_epoch, lr_decay_unit) + param_group_map = group_params(model, param_groups_cfg) + if isinstance(layer_decay, float): + param_group_map = param_group_layer_decay(model, + layer_decay, + weight_decay=weight_decay, + param_groups_map=param_group_map, + no_weight_decay_list=no_weight_decay_name, + ) + elif len(no_weight_decay_name) > 0: + param_group_map = param_group_weight_decay(model, + weight_decay=weight_decay, + param_groups_map=param_group_map, + no_weight_decay_list=no_weight_decay_name, + ) + + for key in param_group_map: + param_group_map[key]['params'] = [p for (n, p) in param_group_map[key]['params']] + + if tensor_fusion: + param_group_map = group_params_by_state(param_group_map) + # fuse params for key in param_group_map: - group = {'params': param_group_map[key]} - + if 'gpu' not in paddle.get_device(): + continue if "'is_distributed': True" in key: - group['is_distributed'] = True - - if 'no_weight_decay' in key: - group['weight_decay'] = 0.0 - - param_group.append(group) - - lr = lr_scheduler - lr_func = None - if isinstance(lr_scheduler, LRCallable): - lr = lr_scheduler.lr + continue + if "'has_sparse_grad': True" in key: + continue + param_group_map[key]["params"] = get_fused_params(param_group_map[key]["params"]) + + param_group = [] + for key in param_group_map: + group = param_group_map[key] + if "'is_distributed': True" in key: + group['is_distributed'] = True + if 'no_weight_decay' in key: + group['weight_decay'] = 0.0 + param_group.append(group) + + # build default lr scheduler + lr = lr_scheduler + lr_func = None + lr_cfg = config.pop('lr', None) + if isinstance(lr_cfg, float): + lr = lr_cfg + elif isinstance(lr_cfg, dict): + if 'decay_unit' in lr_cfg: + logger.warning('decay_unit is no need to set, for it will be reset by lr_decay_unit.') + lr_cfg['decay_unit'] = lr_decay_unit + lr_scheduler = build_lr_scheduler(lr_cfg, epochs, step_each_epoch) + lr = lr_scheduler + if isinstance(lr_scheduler, LRCallable): + lr = lr_scheduler.lr lr_func = lr_scheduler - + assert lr is not None, 'lr should not be None.' 
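+    # Illustrative `lr` settings for the branches above (example values; the TimmCosine
+    # numbers mirror the SwAV pre-training YAML):
+    #     lr: 0.1    -> kept as a constant float learning rate
+    #     lr: {name: TimmCosine, learning_rate: 0.6, eta_min: 0.0006}
+    #                -> turned into an LRScheduler by build_lr_scheduler
+    # If the built scheduler is an LRCallable, its base lr is used here and the callable is
+    # handed to the optimizer as lr_func, which lr_step() then applies per parameter group.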
optim = eval(optim_name)(param_group, - lr=lr, + lr=lr, lr_func=lr_func, grad_clip=grad_clip, - **optim_config) - + **config) logger.debug("build optimizer ({}) success..".format(optim)) - return optim + return optim \ No newline at end of file diff --git a/passl/optimizer/momentum.py b/passl/optimizer/momentum.py index 179839fc..55402fd4 100644 --- a/passl/optimizer/momentum.py +++ b/passl/optimizer/momentum.py @@ -26,6 +26,8 @@ class Momentum(Optimizer): def __init__(self, params, + lr=0.001, + lr_func=None, momentum=0.9, weight_decay=0.0, use_master_param=True, @@ -33,6 +35,8 @@ def __init__(self, **args): defaults = dict( + lr=lr, + lr_func=lr_func, momentum=momentum, weight_decay=weight_decay, use_master_param=use_master_param, @@ -68,7 +72,7 @@ def step(self): grad = p.grad if grad is None: continue - # print('###########',p.name) + if grad.is_selected_rows(): raise RuntimeError( 'Momentum does not support sparse gradients.') diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 234af8b9..19cd3428 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -83,7 +83,6 @@ def add_param_group(self, param_group): param_group.setdefault(name, deepcopy(default)) else: param_group.setdefault(name, default) - params = param_group['params'] if len(params) != len(set(params)): warnings.warn( @@ -114,6 +113,15 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + @staticmethod + def _get_lr(param_group): + lr_t = param_group["lr"] + if isinstance(lr_t, paddle.optimizer.lr.LRScheduler): + lr_t = lr_t.get_lr() + if 'lr_scale' in param_group: + lr_t *= param_group['lr_scale'] + return lr_t + def state_dict(self): def pack_group(group): packed = {k: v for k, v in group.items() if k != 'params'} @@ -206,16 +214,13 @@ def clear_grad(self, set_to_zero=True): @paddle.no_grad() def lr_step(self, step=None): - for i, group in enumerate(self.param_groups): + for group in self.param_groups: lr = group['lr'] - - if isinstance(lr, paddle.optimizer.lr.LRScheduler): # group defined lr scheduler + if isinstance(lr, paddle.optimizer.lr.LRScheduler): lr.step(step) elif 'lr_func' in group and callable(group['lr_func']): group['lr_func'](group, step) - # print("####lr0 {}, lr0 {}".format(self.param_groups[0]['lr'].get_lr(), self.param_groups[1]['lr'].get_lr())) - @paddle.no_grad() def get_lr(self, group_id=0): lr = self.param_groups[group_id]['lr'] @@ -225,4 +230,4 @@ def get_lr(self, group_id=0): @paddle.no_grad() def step(self): - raise NotImplementedError + raise NotImplementedError \ No newline at end of file diff --git a/passl/optimizer/utils/__init__.py b/passl/optimizer/utils/__init__.py new file mode 100644 index 00000000..9d9f7a4a --- /dev/null +++ b/passl/optimizer/utils/__init__.py @@ -0,0 +1 @@ +from .group_params import * \ No newline at end of file diff --git a/passl/optimizer/utils/group_params.py b/passl/optimizer/utils/group_params.py new file mode 100644 index 00000000..9108222c --- /dev/null +++ b/passl/optimizer/utils/group_params.py @@ -0,0 +1,194 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import re +from collections import defaultdict +from passl.utils import logger + + +def group_with_matcher(model, group_matcher): + """ + + Args: + named_params: List like [(name, param),] + group_matcher: Dict like {group_name: regular_expression1} + Returns: + param_groups: Dict like {group_name: [param_name1, param_name2, ...]} + + """ + matcher_list = [] + for group_name, re_exps in group_matcher.items(): + assert re_exps is not None, "re_exps should not be None." + if isinstance(re_exps, (tuple, list)): + for re_str in re_exps: + matcher_list.append((group_name, re.compile(re_str))) + else: + matcher_list.append((group_name, re.compile(re_exps))) + param_groups = defaultdict(list) + default_group = [] + for name, param in model.named_parameters(): + if param.stop_gradient: + continue + flag = 0 + for group_name, matcher in matcher_list: + res = matcher.match(name) + if res: + param_groups[group_name].append((name, param)) + flag = 1 + if flag == 0: + default_group.append((name, param)) + if len(default_group) > 0: + param_groups['default'] = default_group + param_groups = {k: {"params": v} for k, v in param_groups.items()} + return param_groups + + +def group_params_by_state(param_groups_map): + ''' + group parameters by state for tensor fusion + Args: + param_groups_map: Dict like {'group_name': {'params': [param1, param2, ...]}} + + Returns: + new_param_groups: Dict like {'group_name': {'params': [param1, param2, ...]}} + ''' + new_param_groups = {} + for g_name in param_groups_map: + for param in param_groups_map[g_name]['params']: + if param.stop_gradient: + continue + state = copy.deepcopy(param.__dict__) + new_group_name = g_name+'_'+str(state) + if new_group_name not in new_param_groups: + new_param_groups[new_group_name] = { + "params": [], + "group_name": new_group_name, + } + for key in param_groups_map[g_name]: + if key not in ["params", "group_name"]: + new_param_groups[new_group_name][key] = param_groups_map[g_name][key] + + new_param_groups[new_group_name]["params"].append(param) + logger.info(f"The original param_groups which has {len(param_groups_map)} " + f"groups has been split to {len(new_param_groups)} groups by state.") + return new_param_groups + + +def param_group_layer_decay( + model, + layer_decay, + weight_decay=None, + group_matcher=None, + no_weight_decay_list=(), + param_groups_map=None, + ): + ''' + group parameters by layer_decay and weight_decay setting + Args: + model: instance of paddle.nn.Layer + layer_decay: float or None + weight_decay: float or None by default, which can also assigned in the optimizer args, + but it has the highest priority if given here. + group_matcher: Dict like {group_name: regular_expression1} + no_weight_decay_list: list of string(layer name keyword) + param_groups_map: Dict like {group_name: {'params': [(name, group), ...]}} + + Returns: + param_groups: Dict like {group_name: {'params': [(name, group), ...]}} + ''' + assert (not group_matcher) or (not param_groups_map), \ + "group_matcher and param_names_group should not be given in the same time." 
+ if group_matcher: + param_groups_map = group_with_matcher(model, group_matcher) + num_layers = len(param_groups_map) + layer_scales = {z[0]: layer_decay ** (num_layers - i) for i, (k, v) in enumerate(param_groups_map.items()) for z in v} + param_groups = {} + for g_name in param_groups_map: + for name, param in param_groups_map[g_name]['params']: + if param.stop_gradient: + continue + lr_scale = layer_scales[name] if name in layer_scales else 1. + if any(nd in name for nd in no_weight_decay_list): + this_decay = 0. + g_decay = "no_weight_decay" + else: + this_decay = weight_decay + g_decay = "weight_decay" + new_group_name = g_name + '_' + g_decay + if new_group_name not in param_groups: + param_groups[new_group_name] = { + "lr_scale": lr_scale, + "params": [], + "group_name": new_group_name, + } + for key in param_groups_map[g_name]: + if key not in param_groups[new_group_name]: + param_groups[new_group_name][key] = param_groups_map[g_name][key] + if this_decay is not None: + param_groups[new_group_name]["weight_decay"] = this_decay + param_groups[new_group_name]["params"].append((name, param)) + return param_groups + + +def param_group_weight_decay( + model, + group_matcher=None, + weight_decay=None, + no_weight_decay_list=(), + param_groups_map=None, + ): + ''' + group parameters by weight_decay setting + Args: + model: instance of paddle.nn.Layer + group_matcher: Dict like {group_name: regular_expression1} + weight_decay: float or None by default, which can also assigned in the optimizer args, + but it has the highest priority if given here. + no_weight_decay_list: list of string(layer name keyword) + param_groups_map: Dict like {group_name: {'params': [(name, group), ...]}} + + Returns: + param_groups: Dict like {group_name: {'params': [(name, group), ...]}} + ''' + # weight_decay value can be None and assigned in the optimizer config, + # but it has the highest priority if given here. + assert (not group_matcher) or (not param_groups_map), \ + "group_matcher and param_names_group should not be given in the same time." + param_groups = {} + if group_matcher is not None: + param_groups_map = group_with_matcher(model, group_matcher) + for g_name in param_groups_map: + for name, param in param_groups_map[g_name]['params']: + if param.stop_gradient: + continue + if any(nd in name for nd in no_weight_decay_list): + g_decay = "no_weight_decay" + this_decay = 0. 
+ else: + g_decay = "weight_decay" + this_decay = weight_decay + new_group_name = g_name + "_" + g_decay + if new_group_name not in param_groups: + param_groups[new_group_name] = { + "params": [], + "group_name": new_group_name, + } + for key in param_groups_map[g_name]: + if key not in param_groups[new_group_name]: + param_groups[new_group_name][key] = param_groups_map[g_name][key] + if this_decay is not None: + param_groups[new_group_name]["weight_decay"] = this_decay + param_groups[new_group_name]["params"].append((name, param)) + + return param_groups \ No newline at end of file diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index 4f31e170..bb0522e3 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -17,11 +17,11 @@ from passl.utils import logger from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly -from .lr_callable import LRCallable, CosineWithFixLR +from .lr_callable import LRCallable -def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): - lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch, 'decay_unit': decay_unit }) +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) if 'name' in lr_config: lr_name = lr_config.pop('name') if "MultiStepDecay" in lr_name: @@ -39,4 +39,4 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch, decay_unit): else: lr = lr_config['learning_rate'] logger.debug("build lr ({}) success..".format(lr)) - return lr + return lr \ No newline at end of file diff --git a/passl/scheduler/lr_callable.py b/passl/scheduler/lr_callable.py index b4722733..3bd049e9 100644 --- a/passl/scheduler/lr_callable.py +++ b/passl/scheduler/lr_callable.py @@ -1,17 +1,3 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- import math class LRCallable(object): @@ -35,4 +21,4 @@ def __call__(self, group, epoch): if 'fix_lr' in group and group['fix_lr']: group['lr'] = self.lr else: - group['lr'] = cur_lr + group['lr'] = cur_lr \ No newline at end of file diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 8f641d0f..6d9687ab 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -41,16 +41,16 @@ Optimizer: momentum: 0.9 weight_decay: 0.0 tensor_fusion: False - decay_unit: epoch - LRScheduler: + lr_decay_unit: epoch + lr: name: MultiStepDecay learning_rate: 0.02 milestones: [12, 16] gamma: 0.2 last_epoch: -1 - custom_cfg: - - name: head - LRScheduler: + param_groups: + - name: res_model.projection_head + lr: name: MultiStepDecay learning_rate: 5 milestones: [12, 16] @@ -74,7 +74,7 @@ DataLoader: samples_tag: semi_10 sampler: name: DistributedBatchSampler - batch_size: 64 # accum_steps: 1, total batchsize: 256 + batch_size: 128 # accum_steps: 1, total batchsize: 256 drop_last: False shuffle: True loader: @@ -96,7 +96,7 @@ DataLoader: std: [0.228, 0.224, 0.225] sampler: name: DistributedBatchSampler - batch_size: 64 + batch_size: 128 drop_last: False shuffle: False loader: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 5a4e9a83..bf7bdf5b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -42,8 +42,8 @@ Optimizer: momentum: 0.9 weight_decay: 1e-6 tensor_fusion: True - decay_unit: epoch - LRScheduler: + lr_decay_unit: epoch + lr: name: TimmCosine learning_rate: 0.3 eta_min: 0.0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 9eaf4fa1..a6d0c2e5 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -46,8 +46,8 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False - decay_unit: step - LRScheduler: + lr_decay_unit: step + lr: name: TimmCosine learning_rate: 0.6 eta_min: 0.0006 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 5ccbc6ad..4cf7398e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -49,8 +49,8 @@ Optimizer: trust_coefficient: 0.001 clip: False tensor_fusion: False - decay_unit: step - LRScheduler: + lr_decay_unit: step + lr: name: TimmCosine learning_rate: 4.8 eta_min: 0.0048 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index 9844d806..e52a9d0f 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -17,7 +17,7 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=4,5,6,7 +export CUDA_VISIBLE_DEVICES=0,1 #,2,5 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ From 77060bf5d78282ec33746eeb41581674ea554be5 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 
17:17:52 +0800 Subject: [PATCH 20/46] format --- passl/core/param_fuse.py | 2 +- passl/data/dataset/multicrop_dataset.py | 4 +- passl/engine/engine.py | 2 +- passl/engine/loops/classification_loop.py | 8 +- .../engine/loops/contrastive_learning_loop.py | 113 +------------ passl/models/swav.py | 154 ++++-------------- passl/models/swav_resnet.py | 29 +++- passl/optimizer/__init__.py | 4 +- passl/optimizer/optimizer.py | 2 +- passl/optimizer/utils/__init__.py | 16 +- passl/optimizer/utils/group_params.py | 2 +- passl/scheduler/__init__.py | 2 +- passl/scheduler/lr_callable.py | 2 +- passl/utils/io.py | 4 +- tasks/ssl/swav/README.md | 2 +- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 +- ...swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 2 +- tasks/ssl/swav/finetune.sh | 4 +- tasks/ssl/swav/pretrain.sh | 2 +- .../swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh | 2 +- .../swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh | 2 +- ...wav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh | 2 +- 22 files changed, 105 insertions(+), 259 deletions(-) diff --git a/passl/core/param_fuse.py b/passl/core/param_fuse.py index 87fc5cb3..f3ff5e46 100644 --- a/passl/core/param_fuse.py +++ b/passl/core/param_fuse.py @@ -504,4 +504,4 @@ def get_fused_params(params): for group_idx, parameters in var_groups.items(): fused_param = flatten_dense_tensors(parameters) fused_params.append(fused_param) - return fused_params \ No newline at end of file + return fused_params diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index 42b800f7..a4488e7b 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -72,7 +72,7 @@ def __getitem__(self, index): path, target = self.imgs[index] sample = self.loader(path) sample = list(map(lambda trans: trans(sample), self.trans)) - + return sample, target @@ -89,4 +89,4 @@ def get_color_distortion(s=1.0): rnd_color_jitter = RandomApply([color_jitter], p=0.8) rnd_gray = RandomGrayscale(p=0.2) color_distort = Compose([rnd_color_jitter, rnd_gray]) - return color_distort \ No newline at end of file + return color_distort diff --git a/passl/engine/engine.py b/passl/engine/engine.py index 7cacb83f..c3277561 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -372,4 +372,4 @@ def export(self): self.model.eval() path = os.path.join(self.output_dir, self.config["Model"]["name"]) - io.export(self.config["Export"], self.model, path) \ No newline at end of file + io.export(self.config["Export"], self.model, path) diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 08bdc1d0..659bcc19 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -51,7 +51,7 @@ def forward_backward(self, batch): for idx in range(self.trainer.accum_steps): data = batch[0][idx * step_size:(idx + 1) * step_size] label = batch[1][idx * step_size:(idx + 1) * step_size] - + # do cast if using fp16 otherwise do nothing with paddle.amp.auto_cast( enable=self.trainer.fp16, @@ -61,7 +61,7 @@ def forward_backward(self, batch): out = self.trainer.model(data) final_out.append(out) - + loss_dict = self.trainer.train_loss_func(out, label) for key in loss_dict: @@ -92,7 +92,7 @@ def train_one_step(self, batch, total_iterations=None): self.trainer.scaler.update() # clear gradients self.trainer.optimizer.clear_grad() - + if self.trainer.lr_decay_unit == 'step': self.trainer.optimizer.lr_step(self.global_step) @@ -175,7 +175,7 @@ def eval_one_dataset(self, 
eval_dataloader): custom_white_list=self.trainer.fp16_custom_white_list, custom_black_list=self.trainer.fp16_custom_black_list, level=self.trainer.fp16_level): - + out = self.trainer.model(batch[0]) # calc loss if self.trainer.eval_loss_func is not None: diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index d943d4cc..5bdefea6 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -16,97 +16,13 @@ from __future__ import division from __future__ import print_function -import os -import sys -import logging -from datetime import timedelta - -import time import collections -import platform import paddle -from passl.core import grad_sync, param_sync -from passl.utils import io +from passl.core import grad_sync -from passl.utils import profiler from passl.utils import logger from .loop import TrainingEpochLoop - -class LogFormatter: - def __init__(self): - self.start_time = time.time() - - def format(self, record): - elapsed_seconds = round(record.created - self.start_time) - - prefix = "%s - %s - %s" % ( - record.levelname, - time.strftime("%x %X"), - timedelta(seconds=elapsed_seconds), - ) - message = record.getMessage() - message = message.replace("\n", "\n" + " " * (len(prefix) + 3)) - return "%s - %s" % (prefix, message) if message else "" - - -def create_logger(filepath, rank): - """ - Create a logger. - Use a different log file for each process. - """ - # create log formatter - log_formatter = LogFormatter() - - # create file handler and set level to debug - if filepath is not None: - if rank > 0: - filepath = "%s-%i" % (filepath, rank) - file_handler = logging.FileHandler(filepath, "a") - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(log_formatter) - - # create console handler and set level to info - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - console_handler.setFormatter(log_formatter) - - # create logger and set level to debug - logger = logging.getLogger() - logger.handlers = [] - logger.setLevel(logging.DEBUG) - logger.propagate = False - if filepath is not None: - logger.addHandler(file_handler) - logger.addHandler(console_handler) - - # reset logger elapsed time - def reset_time(): - log_formatter.start_time = time.time() - - logger.reset_time = reset_time - - return logger - - -def init_logger(name): - logger = create_logger( - os.path.join("{}.log".format(name)), rank=0 - ) - logger.info("============ Initialized logger ============") - logger.info("") - return logger - - -def log_model(model, logger): - model1 = model.res_model - for name, param in model1.named_parameters(): - logger.info(name) - logger.info(param.abs().sum()) - if param.grad is not None: - logger.info(name+'grad') - logger.info(param.grad.abs().sum()) - class ContrastiveLearningTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): @@ -136,9 +52,7 @@ def forward_backward(self, batch, total_iterations): if isinstance(loss_dict, paddle.Tensor): loss_dict = {'loss': loss_dict} - ####### test ####### - # logger1 = init_logger('before_pretrain') - # log_model(self.trainer.model, logger1) + for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps @@ -148,32 +62,19 @@ def forward_backward(self, batch, total_iterations): # loss scaling if using fp16 otherwise do nothing scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - - try: + + try: 
self.trainer.model.after_loss_backward(total_iterations) except AttributeError: logger.warning("Model has no after_loss_backward method, ignored this process") - - ####### test ####### -# grad_sync(self.trainer.optimizer.param_groups) - -# # do unscale and step if using fp16 and not found nan/inf -# # otherwise do nothing -# self.trainer.scaler.step(self.trainer.optimizer) -# # do update loss scaling if using fp16 -# # otherwise do nothing -# self.trainer.scaler.update() - -# logger2 = init_logger('after_pretrain') - # log_model(self.trainer.model, logger2) - # print('final_loss_dict', final_loss_dict) + return final_loss_dict def train_one_step(self, batch, total_iterations): # remove label batch = batch[0] - + # do forward and backward loss_dict = self.forward_backward(batch, total_iterations) @@ -185,7 +86,7 @@ def train_one_step(self, batch, total_iterations): # do update loss scaling if using fp16 # otherwise do nothing self.trainer.scaler.update() - + # clear gradients self.trainer.optimizer.clear_grad() if self.trainer.lr_decay_unit == 'step': diff --git a/passl/models/swav.py b/passl/models/swav.py index 8d509560..50795189 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
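A brief note on the loop code being reformatted above: `forward_backward` divides each loss by `accum_steps` and only the outer `train_one_step` calls the optimizer, so gradients accumulate over several micro-batches before a single update. A minimal sketch of that pattern in plain Paddle (no AMP scaler, names are illustrative):

```python
import paddle

def train_one_step(model, loss_fn, optimizer, micro_batches):
    """Accumulate gradients over len(micro_batches) forward/backward passes,
    then apply one optimizer update for the whole effective batch."""
    accum_steps = len(micro_batches)
    for data, label in micro_batches:
        loss = loss_fn(model(data), label) / accum_steps  # keep the loss scale comparable
        loss.backward()                                    # gradients add up across micro-batches
    optimizer.step()
    optimizer.clear_grad()
```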
+ import os import numpy as np from sys import flags @@ -23,17 +37,16 @@ 'SwAVPretrain', ] -# def model and class SwAV(Model): def __init__(self, **kwargs): super().__init__() self.res_model = swavresnet50(**kwargs) - + def _load_model(self, path, model, tag): path = path + ".pdparams" if os.path.isfile(path): para_state_dict = paddle.load(path) - + # resnet model_state_dict = model.state_dict() keys = model_state_dict.keys() @@ -68,13 +81,13 @@ def save(self, path, local_rank=0, rank=0): def _freeze_norm(self, layer): if isinstance(layer, (nn.layer.norm._BatchNormBase)): layer._use_global_stats = True - + class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) self.linear = RegLog(class_num) self.res_model.eval() - + # freeze all layers but the last fc for name, param in self.named_parameters(): if name not in ['linear.linear.weight', 'linear.linear.bias']: @@ -84,9 +97,9 @@ def __init__(self, class_num=1000, **kwargs): parameters = list( filter(lambda p: not p.stop_gradient, self.parameters())) assert len(parameters) == 2 # weight, bias - + self.apply(self._freeze_norm) - + def load_pretrained(self, path, rank=0, finetune=False): self._load_model(path, self.res_model, 'backbone') @@ -94,79 +107,17 @@ def forward(self, inp): with paddle.no_grad(): output = self.res_model(inp) output = self.linear(output) - + return output class SwAVFinetune(SwAV): def __init__(self, **kwargs): super().__init__(**kwargs) self.apply(self._freeze_norm) - + def load_pretrained(self, path, rank=0, finetune=False): - self._load_model(path, self.res_model, 'backbone') - - # def param_groups(self, config, tensor_fusion=True, epochs=None, trainset_length=None): - # """ - # custom_cfg(dict|optional): [{'name': 'backbone', 'lr': 0.1, 'LRScheduler': {"lr":1.0}}, {'name': 'norm', 'weight_decay_mult': 0}] - # """ - - # self.custom_cfg = config.pop('custom_cfg', None) - # if self.custom_cfg is not None: - # assert isinstance(self.custom_cfg, list), "`custom_cfg` must be a list." 
- - # for item in self.custom_cfg: - # assert isinstance( - # item, dict), "The item of `custom_cfg` must be a dict" - - # param_group = self._collect_params(config, self.res_model, tensor_fusion, epochs, trainset_length) - - # return param_group - - # def _collect_params(self, config, model, tensor_fusion, epochs, trainset_length): - # # Collect different parameter groups - # if self.custom_cfg is None or len(self.custom_cfg) == 0: - # return [{'params': model.parameters(), 'tensor_fusion': tensor_fusion}] - - # # split params - # self.weight_decay = config['weight_decay'] - # params_dict = {item['name']: [] for item in self.custom_cfg} # key name and a PasslDefault - # params_dict['PasslDefault'] = [] - # for name, param in model.named_parameters(): - # if param.stop_gradient: - # continue - # for idx, item in enumerate(self.custom_cfg): - # if item['name'] in name: - # params_dict[item['name']].append(param) - # break - # else: - # params_dict['PasslDefault'].append(param) - - # res = [] - # for item in self.custom_cfg: - # weight_decay_mult = item.get("weight_decay_mult", None) - # if item.get("LRScheduler", None) is not None: - # lr_scheduler = build_lr_scheduler(item['LRScheduler'], epochs, trainset_length, config['decay_unit']) - # else: - # Warning('The LRScheduler is not set for group with name {}, use default LRScheduler'.format(item['name'])) - # param_dict = {'params': params_dict[item['name']], 'lr': lr_scheduler} - - # if self.weight_decay is not None and weight_decay_mult is not None: - # param_dict['weight_decay'] = self.weight_decay * weight_decay_mult - # param_dict['tensor_fusion'] = tensor_fusion - # res.append(param_dict) - # else: - # res.append({'params': params_dict['PasslDefault'], 'tensor_fusion': tensor_fusion}) - - # msg = 'Parameter groups for optimizer: \n' - # for idx, item in enumerate(self.custom_cfg): - # params_name = [p.name for p in params_dict[item['name']]] - # item = item.copy() - # item['params_name'] = params_name - # msg += 'Group {}: \n{} \n'.format(idx, item) - # logger.info(msg) - - # return res - + self._load_model(path, self.res_model, 'backbone') + def forward(self, inp): return self.res_model(inp) @@ -179,23 +130,11 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep self.epsilon = epsilon self.freeze_prototypes_niters = freeze_prototypes_niters - # initialize queue - self.queue = None - # queue_path = os.path.join('.', "queue" + str(0) + ".pth") - # if os.path.isfile(queue_path): - # self.queue = paddle.load(queue_path)["queue"] - # # the queue needs to be divisible by the batch size - # queue_length = queue_length - # queue_length -= queue_length % (256) - # if queue_length > 0 and epoch >= 15 and self.queue is None: - # self.queue = paddle.zeros([len(crops_for_assign), - # queue_length // 4, kwargs['output_dim']]) - # self.load_pretrained('swav_800ep_pretrain.pdparams') self.apply(self._freeze_norm) - + def load_pretrained(self, path, rank=0, finetune=False): - self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') - + self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') + @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): Q = paddle.exp(x=out / self.epsilon).t() @@ -215,11 +154,6 @@ def distributed_sinkhorn(self, out, sinkhorn_iterations=3): return Q.t() def forward(self, inp): - # ####### test ####### - # import numpy as np - # np.random.seed(42) - # a = np.random.rand(32, 3, 224, 224) - # inp = paddle.to_tensor(a).astype('float32') 
bs = inp[0].shape[0] # normalize the prototypes @@ -228,8 +162,6 @@ def forward(self, inp): w = paddle.nn.functional.normalize(x=w, axis=0, p=2) # 1 paddle.assign(w, self.res_model.prototypes.weight) embedding, output = self.res_model(inp) - # print('output, embedding', embedding.mean(), output.mean(), inp.mean()) - # import pdb; pdb.set_trace() embedding = embedding.detach() # compute loss @@ -237,39 +169,25 @@ def forward(self, inp): for i, crop_id in enumerate(self.crops_for_assign): with paddle.no_grad(): out = output[bs * crop_id:bs * (crop_id + 1)].detach() - # print('bs, crop_id', bs, crop_id, self.nmb_crops) - if self.queue is not None: - if use_the_queue or not paddle.all(x=self.queue[(i), (-1), :] == 0): - use_the_queue = True - out = paddle.concat(x=(paddle.mm(input=self.queue[i], - mat2=self.res_model.prototypes.weight.t()), out)) - self.queue[(i), bs:] = self.queue[(i), :-bs].clone() - self.queue[(i), :bs] = embedding[crop_id * bs:(crop_id + 1) * bs] - q = self.distributed_sinkhorn(out)[-bs:] - # print('out.mean(), q.mean()', out.mean(), q.mean()) - + subloss = 0 - # print(output.shape) for v in np.delete(np.arange(np.sum(self.nmb_crops)), crop_id): x = output[bs * v:bs * (v + 1)] / self.temperature subloss -= paddle.mean(x=paddle.sum(x=q * paddle.nn. functional.log_softmax(x=x, axis=1), axis=1)) - # print('v, subloss', v, subloss) - + loss += subloss / (np.sum(self.nmb_crops) - 1) - # print('i, loss', i, loss) - # import pdb; pdb.set_trace() loss /= len(self.crops_for_assign) return loss - + def after_loss_backward(self, iteration): if iteration < self.freeze_prototypes_niters: for name, p in self.res_model.named_parameters(): if 'prototypes' in name and p.grad is not None: p.clear_grad() - + def swav_resnet50_linearprobe(**kwargs): model = SwAVLinearProbe(**kwargs) return model @@ -295,9 +213,9 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo # with apex syncbn speeds up computation than global syncbn process_group = apex.parallel.create_syncbn_process_group(8) model = apex.parallel.convert_syncbn_model(model, process_group=process_group) - - return model - + + return model + class RegLog(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" @@ -306,11 +224,11 @@ def __init__(self, num_labels): s = 2048 self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) - + init.normal_(self.linear.weight, mean=0.0, std=0.01) init.zeros_(self.linear.bias) def forward(self, x): x = self.av_pool(x) x = x.reshape((x.shape[0], -1)) - return self.linear(x) \ No newline at end of file + return self.linear(x) diff --git a/passl/models/swav_resnet.py b/passl/models/swav_resnet.py index 2869eedc..03e5dffb 100644 --- a/passl/models/swav_resnet.py +++ b/passl/models/swav_resnet.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
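To make the swapped-assignment loss above easier to follow, here is a single-process sketch of the Sinkhorn-Knopp normalization that `distributed_sinkhorn` performs, written in NumPy and without the distributed all-reduce; the function and variable names are illustrative:

```python
import numpy as np

def sinkhorn(scores, epsilon=0.05, n_iters=3):
    """scores: [batch, num_prototypes] prototype similarities for one crop.

    Returns soft assignments of the same shape whose prototype marginals are
    roughly uniform, so every prototype receives a similar share of the batch.
    """
    Q = np.exp(scores / epsilon).T         # [num_prototypes, batch]
    Q /= Q.sum()
    K, B = Q.shape
    for _ in range(n_iters):
        Q /= Q.sum(axis=1, keepdims=True)  # balance prototype (row) marginals
        Q /= K
        Q /= Q.sum(axis=0, keepdims=True)  # renormalize each sample (column)
        Q /= B
    return (Q * B).T                       # each returned row sums to 1
```

The codes `q` obtained this way for one crop are then predicted from the temperature-scaled softmax of every other crop, which is the cross-entropy term accumulated into `subloss` in the forward pass above.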
+ import paddle import functools import paddle.nn as nn @@ -11,20 +25,20 @@ def kaiming_normal_init(param, **kwargs): def constant_init(param, **kwargs): initializer = nn.initializer.Constant(**kwargs) initializer(param, param.block) - - + + class SwAVResNet(paddle.nn.Layer): def __init__(self, block, depth, normalize=False, output_dim=0, hidden_mlp=0, nmb_prototypes=0, eval_mode=False): - + super(SwAVResNet, self).__init__() self.l2norm = normalize self.eval_mode = eval_mode num_out_filters = 512 - + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - + if output_dim == 0: self.projection_head = None elif hidden_mlp == 0: @@ -59,7 +73,7 @@ def forward_backbone(self, x): if self.eval_mode: return x - + x = self.avgpool(x) x = paddle.flatten(x=x, start_axis=1) return x @@ -76,7 +90,7 @@ def forward_head(self, x): def forward(self, inputs): if not isinstance(inputs, list): inputs = [inputs] - + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. to_tensor(data=[inp.shape[-1] for inp in inputs]), return_counts=True)[1], axis=0) # padiff @@ -108,4 +122,3 @@ def forward(self, x): def swavresnet50(**kwargs): return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) - diff --git a/passl/optimizer/__init__.py b/passl/optimizer/__init__.py index 9f2170ae..5f7d3982 100644 --- a/passl/optimizer/__init__.py +++ b/passl/optimizer/__init__.py @@ -93,7 +93,7 @@ def group_params(model, param_groups_cfg=None): if 'regular_exp' in params_dict[g_name]: regular_exp = params_dict[g_name]['regular_exp'] group_matcher = re.compile(regular_exp) - else: + else: group_matcher = re.compile(g_name) if group_matcher.match(name): params_dict[g_name]["params"].append((name, param)) @@ -211,4 +211,4 @@ def build_optimizer(config, lr_scheduler, model, epochs, step_each_epoch, lr_dec grad_clip=grad_clip, **config) logger.debug("build optimizer ({}) success..".format(optim)) - return optim \ No newline at end of file + return optim diff --git a/passl/optimizer/optimizer.py b/passl/optimizer/optimizer.py index 19cd3428..f556240a 100644 --- a/passl/optimizer/optimizer.py +++ b/passl/optimizer/optimizer.py @@ -230,4 +230,4 @@ def get_lr(self, group_id=0): @paddle.no_grad() def step(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/passl/optimizer/utils/__init__.py b/passl/optimizer/utils/__init__.py index 9d9f7a4a..d79233f8 100644 --- a/passl/optimizer/utils/__init__.py +++ b/passl/optimizer/utils/__init__.py @@ -1 +1,15 @@ -from .group_params import * \ No newline at end of file +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
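The multi-crop forward in `swav_resnet.py` shown above groups crops by spatial size (`unique_consecutive` over the crop resolutions followed by `cumsum`) so that same-sized crops go through the backbone as one concatenated batch. A small illustrative sketch of that grouping, assuming same-sized crops are consecutive as in the usual 2 global + 6 local crop setup:

```python
import numpy as np

def crop_group_ends(crop_sizes):
    """Return the end index of each run of equal crop sizes, e.g.
    [224, 224, 96, 96, 96, 96, 96, 96] -> [2, 8]."""
    sizes = np.asarray(crop_sizes)
    change_points = np.flatnonzero(np.diff(sizes)) + 1            # positions where the size changes
    run_lengths = np.diff(np.concatenate(([0], change_points, [len(sizes)])))
    return np.cumsum(run_lengths).tolist()

print(crop_group_ends([224, 224, 96, 96, 96, 96, 96, 96]))  # [2, 8]
```

Each slice of same-sized crops can then be concatenated and encoded in a single forward pass instead of one pass per crop.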
+ +from .group_params import * diff --git a/passl/optimizer/utils/group_params.py b/passl/optimizer/utils/group_params.py index 9108222c..bd04904d 100644 --- a/passl/optimizer/utils/group_params.py +++ b/passl/optimizer/utils/group_params.py @@ -191,4 +191,4 @@ def param_group_weight_decay( param_groups[new_group_name]["weight_decay"] = this_decay param_groups[new_group_name]["params"].append((name, param)) - return param_groups \ No newline at end of file + return param_groups diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index bb0522e3..a538bb70 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -39,4 +39,4 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch): else: lr = lr_config['learning_rate'] logger.debug("build lr ({}) success..".format(lr)) - return lr \ No newline at end of file + return lr diff --git a/passl/scheduler/lr_callable.py b/passl/scheduler/lr_callable.py index 62a46155..137fdc82 100644 --- a/passl/scheduler/lr_callable.py +++ b/passl/scheduler/lr_callable.py @@ -18,4 +18,4 @@ def __init__(self, learning_rate): self.lr = learning_rate def __call__(self, param_group, epoch): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/passl/utils/io.py b/passl/utils/io.py index 8904215c..ab166eaf 100644 --- a/passl/utils/io.py +++ b/passl/utils/io.py @@ -157,12 +157,12 @@ def save_checkpoint(net, if local_rank == 0: if loss_scaler is not None: opt_state_dict['scaler_state'] = loss_scaler.state_dict() - + # Solve AttrDict can't pickle error for group in opt_state_dict['param_groups']: if 'LRScheduler' in group: group['LRScheduler'] = dict(group['LRScheduler']) - + for model_prefix in model_prefixs: paddle.save(opt_state_dict, model_prefix + ".pdopt") paddle.save(metric_info, model_prefix + ".pdstates") diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index d14c1b81..3bd16663 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -92,7 +92,7 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co ```bibtex @misc{caron2021unsupervised, - title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments}, + title={Unsupervised Learning of Visual Features by Contrasting Cluster Assignments}, author={Mathilde Caron and Ishan Misra and Julien Mairal and Priya Goyal and Piotr Bojanowski and Armand Joulin}, year={2021}, eprint={2006.09882}, diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 6d9687ab..11feaa57 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -50,7 +50,7 @@ Optimizer: last_epoch: -1 param_groups: - name: res_model.projection_head - lr: + lr: name: MultiStepDecay learning_rate: 5 milestones: [12, 16] @@ -113,4 +113,4 @@ Metric: Export: export_type: paddle - input_shape: [None, 3, 224, 224] \ No newline at end of file + input_shape: [None, 3, 224, 224] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index bf7bdf5b..37247fde 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -24,7 +24,7 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_linearprobe - output_dim: 0 + 
output_dim: 0 eval_mode: True class_num: 1000 diff --git a/tasks/ssl/swav/finetune.sh b/tasks/ssl/swav/finetune.sh index e52a9d0f..3a598182 100644 --- a/tasks/ssl/swav/finetune.sh +++ b/tasks/ssl/swav/finetune.sh @@ -17,11 +17,11 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1 #,2,5 +export CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml - \ No newline at end of file + diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index d30ff34b..d1f6e86e 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -23,4 +23,4 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml - # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml \ No newline at end of file + # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh index 187b8e8b..f56f1e0b 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh @@ -27,4 +27,4 @@ python -m paddle.distributed.launch \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained \ No newline at end of file + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh index 7c748f15..fd8a7709 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh @@ -27,4 +27,4 @@ python -m paddle.distributed.launch \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 \ - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained \ No newline at end of file + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh index 954705ad..2fa7ad20 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh @@ -26,4 +26,4 @@ python -m paddle.distributed.launch \ -o Global.print_batch_step=1 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ - -o Global.flags.FLAGS_cudnn_deterministic=1 \ No newline at end of file + -o Global.flags.FLAGS_cudnn_deterministic=1 From 45d527359158ba55efbedcb80e621e9801ff4c4d Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 5 May 2023 17:33:33 +0800 Subject: [PATCH 21/46] valid_ft --- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 3 --- tasks/ssl/swav/pretrain.sh | 6 ++--- tasks/ssl/swav/pretrain_1N8C.sh | 26 +++++++++++++++++++ tests/CI/case.sh | 2 +- 4 files changed, 29 insertions(+), 8 
deletions(-) create mode 100644 tasks/ssl/swav/pretrain_1N8C.sh diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 4cf7398e..deb515a3 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -21,9 +21,6 @@ Global: # FP16 setting FP16: level: O1 -# GradScaler: -# init_loss_scaling: 65536.0 -# incr_every_n_steps: 2000 DistributedStrategy: data_parallel: True diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index d1f6e86e..e45ea53c 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS -export PADDLE_NNODES=1 +export PADDLE_NNODES=4 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 @@ -22,5 +21,4 @@ python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ - tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml - # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml diff --git a/tasks/ssl/swav/pretrain_1N8C.sh b/tasks/ssl/swav/pretrain_1N8C.sh new file mode 100644 index 00000000..d1f6e86e --- /dev/null +++ b/tasks/ssl/swav/pretrain_1N8C.sh @@ -0,0 +1,26 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=1 +export PADDLE_MASTER="127.0.0.1:12538" +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml + # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml diff --git a/tests/CI/case.sh b/tests/CI/case.sh index f0c8772d..21a1f9b5 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -398,7 +398,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=2.23445 + loss_base=1.95351 ips_base=793.89847 mem_base=5.67 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} From 3ea3e73012bbeda0b4451656fba6d76e753c87ec Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 09:58:45 +0800 Subject: [PATCH 22/46] backbone_config --- passl/data/dataset/multicrop_dataset.py | 2 -- passl/models/swav.py | 7 ++++++- .../swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 +++- .../swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 8 +++++--- .../swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 12 +++++++----- .../swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 10 ++++++---- tasks/ssl/swav/pretrain.sh | 2 +- tasks/ssl/swav/pretrain_1N8C.sh | 1 - 8 files changed, 28 insertions(+), 18 deletions(-) diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index a4488e7b..3a098b01 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -23,7 +23,6 @@ from passl.data.dataset.imagefolder_dataset import ImageFolder from passl.data.preprocess import ( RandomApply, - # GaussianBlur, SimCLRGaussianBlur, NormalizeImage, RandomGrayscale, @@ -77,7 +76,6 @@ def __getitem__(self, index): def get_pil_gaussian_blur(p=0.5): - # gaussian_blur = GaussianBlur(sigma=[.1, 2.], _PIL=True) gaussian_blur = SimCLRGaussianBlur(sigma=[.1, 2.]) rnd_gaussian_blur = RandomApply([gaussian_blur], p=p) return rnd_gaussian_blur diff --git a/passl/models/swav.py b/passl/models/swav.py index 50795189..989b11ab 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -40,7 +40,12 @@ class SwAV(Model): def __init__(self, **kwargs): super().__init__() - self.res_model = swavresnet50(**kwargs) + backbone_config = kwargs['backbone'] + backbone_type = backbone_config.pop("type", None) + if backbone_type is not None: + self.res_model = eval(backbone_type)(**backbone_config) + else: + AttributeError(f'Backbone type is not assigned, please assign it.') def _load_model(self, path, model, tag): path = path + ".pdparams" diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 11feaa57..7dc624cc 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -25,7 +25,9 @@ DistributedStrategy: # model architecture Model: name: 
swav_resnet50_finetune - output_dim: 1000 + backbone: + type: swavresnet50 + output_dim: 1000 # loss function config for traing/eval process Loss: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 37247fde..68b54a47 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -24,9 +24,11 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_linearprobe - output_dim: 0 - eval_mode: True - class_num: 1000 + backbone: + type: swavresnet50 + output_dim: 0 + eval_mode: True + class_num: 1000 # loss function config for traing/eval process Loss: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index a6d0c2e5..2e5b7a7d 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -28,16 +28,18 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_pretrain - apex: False + backbone: + type: swavresnet50 + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 + apex: False queue_length: 3804 # 0 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 freeze_prototypes_niters: 5005 # 313 - normalize: True - hidden_mlp: 2048 - output_dim: 128 - nmb_prototypes: 3000 Optimizer: name: MomentumLARC diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index deb515a3..cdc0ebf8 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -28,16 +28,18 @@ DistributedStrategy: # model architecture Model: name: swav_resnet50_pretrain + backbone: + type: swavresnet50 + normalize: True + hidden_mlp: 2048 + output_dim: 128 + nmb_prototypes: 3000 apex: False queue_length: 0 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 freeze_prototypes_niters: 313 - normalize: True - hidden_mlp: 2048 - output_dim: 128 - nmb_prototypes: 3000 Optimizer: name: MomentumLARC diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index e45ea53c..ad460394 100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-unset DISTRIBUTED_TRAINER_ENDPOINTS +# unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=4 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/tasks/ssl/swav/pretrain_1N8C.sh b/tasks/ssl/swav/pretrain_1N8C.sh index d1f6e86e..ce6caad8 100644 --- a/tasks/ssl/swav/pretrain_1N8C.sh +++ b/tasks/ssl/swav/pretrain_1N8C.sh @@ -23,4 +23,3 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml - # tools/train.py -c tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml From 675e075959bd200d4acc033d1998ea4cccbdebe3 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 14:31:02 +0800 Subject: [PATCH 23/46] verified --- passl/data/dataset/imagefolder_dataset.py | 7 +-- passl/data/dataset/multicrop_dataset.py | 2 +- .../engine/loops/contrastive_learning_loop.py | 5 +-- passl/models/swav.py | 22 ++++----- tasks/ssl/swav/README.md | 45 ++++++++++++------- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 +- ...swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 2 +- ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 8 ++-- 8 files changed, 54 insertions(+), 41 deletions(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index dac2634a..a42c6425 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -20,6 +20,7 @@ import paddle +from passl.utils import logger from passl.data.dataset import default_loader IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", @@ -66,8 +67,8 @@ def __init__(self, if samples_tag is None: samples = self.make_dataset(self.root, class_to_idx, extensions) elif samples_tag == "semi_1" or samples_tag == "semi_10": - # connection reset - # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(percent) + "percent.txt") + # connection reset proxyon + # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(samples_tag.split('_')[-1]) + "percent.txt") # list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] subset_file = str(samples_tag.split('_')[-1]) + "percent.txt" with open(subset_file, 'r') as f: @@ -77,7 +78,7 @@ def __init__(self, else: raise NotImplementedError('{} is not implemented'.format(samples)) - print(f'find total {len(classes)} classes and {len(samples)} images.') + logger.info(f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/multicrop_dataset.py index 3a098b01..f3acce64 100644 --- a/passl/data/dataset/multicrop_dataset.py +++ b/passl/data/dataset/multicrop_dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
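Regarding the `semi_1` / `semi_10` branch touched above: it restricts ImageNet to the 1% and 10% label splits published with SimCLR by keeping only samples whose file names appear in `1percent.txt` or `10percent.txt`. A hedged sketch of that filtering step; the helper name and sample layout are illustrative, not the exact PASSL implementation:

```python
import os

def filter_semi_subset(samples, samples_tag):
    """Keep only (path, target) pairs whose file name is listed in the
    SimCLR subset file, e.g. '10percent.txt' for samples_tag='semi_10'."""
    subset_file = samples_tag.split('_')[-1] + "percent.txt"
    with open(subset_file) as f:
        keep = {line.strip() for line in f if line.strip()}
    return [(path, target) for path, target in samples
            if os.path.basename(path) in keep]
```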
diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 5bdefea6..2cd5c91c 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -16,10 +16,10 @@ from __future__ import division from __future__ import print_function -import collections import paddle -from passl.core import grad_sync +import collections +from passl.core import grad_sync from passl.utils import logger from .loop import TrainingEpochLoop @@ -52,7 +52,6 @@ def forward_backward(self, batch, total_iterations): if isinstance(loss_dict, paddle.Tensor): loss_dict = {'loss': loss_dict} - for key in loss_dict: loss_dict[key] = loss_dict[key] / self.trainer.accum_steps diff --git a/passl/models/swav.py b/passl/models/swav.py index 989b11ab..0a423b30 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -47,21 +47,21 @@ def __init__(self, **kwargs): else: AttributeError(f'Backbone type is not assigned, please assign it.') - def _load_model(self, path, model, tag): + def _load_model(self, path, tag): path = path + ".pdparams" if os.path.isfile(path): para_state_dict = paddle.load(path) # resnet - model_state_dict = model.state_dict() + model_state_dict = self.state_dict() keys = model_state_dict.keys() num_params_loaded = 0 for k in keys: if k not in para_state_dict: - print("{} is not in pretrained model".format(k)) + logger.info("{} is not in pretrained model".format(k)) elif list(para_state_dict[k].shape) != list(model_state_dict[k] .shape): - print( + logger.info( "[SKIP] Shape of pretrained params {} doesn't match.(Pretrained: {}, Actual: {})" .format(k, para_state_dict[k].shape, model_state_dict[k] .shape)) @@ -71,11 +71,11 @@ def _load_model(self, path, model, tag): para_state_dict[k] = para_state_dict[k].astype(model_state_dict[k].dtype) model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 - model.set_dict(model_state_dict) - print("There are {}/{} variables loaded into {}.".format( + self.set_dict(model_state_dict) + logger.info("There are {}/{} variables loaded into {}.".format( num_params_loaded, len(model_state_dict), tag)) else: - print("No pretrained weights found in {} => training with random weights".format(tag)) + logger.info("No pretrained weights found in {} => training with random weights".format(tag)) def load_pretrained(self, path, rank=0, finetune=False): pass @@ -106,7 +106,7 @@ def __init__(self, class_num=1000, **kwargs): self.apply(self._freeze_norm) def load_pretrained(self, path, rank=0, finetune=False): - self._load_model(path, self.res_model, 'backbone') + self._load_model(path, 'backbone') def forward(self, inp): with paddle.no_grad(): @@ -121,7 +121,7 @@ def __init__(self, **kwargs): self.apply(self._freeze_norm) def load_pretrained(self, path, rank=0, finetune=False): - self._load_model(path, self.res_model, 'backbone') + self._load_model(path, 'backbone') def forward(self, inp): return self.res_model(inp) @@ -137,8 +137,8 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep self.apply(self._freeze_norm) - def load_pretrained(self, path, rank=0, finetune=False): - self._load_model('swav_800ep_pretrain.pdparams', self.res_model, 'backbone') + # def load_pretrained(self, path, rank=0, finetune=False): + # self._load_model('swav_800ep_pretrain.pdparams', 'backbone') @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 3bd16663..4cd19543 
100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -59,22 +59,35 @@ python -m paddle.distributed.launch \ ``` ## How to End-to-End Fine-tuning -To perform end-to-end fine-tuning for SwAV, run the training with the trained PASSL format checkpoint: - -```bash -unset PADDLE_TRAINER_ENDPOINTS -export PADDLE_NNODES=1 -export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3 -export FLAGS_stop_check_timeout=3600 - -python -m paddle.distributed.launch \ - --nnodes=$PADDLE_NNODES \ - --master=$PADDLE_MASTER \ - --devices=$CUDA_VISIBLE_DEVICES \ - passl-train \ - -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml -``` +To perform end-to-end fine-tuning for SwAV: + +* First, download the data split text files with the following commands: + ```bash + cd PASSL + + wget "https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/10percent.txt" + + wget "https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/1percent.txt" + ``` + +* Then, download the pretrained model to `./pretrained/swav/swav_resnet50_in1k_800ep_pretrained.pdparams` + +* Finally, run the training with the trained PASSL format checkpoint: + ```bash + unset PADDLE_TRAINER_ENDPOINTS + export PADDLE_NNODES=1 + export PADDLE_MASTER="127.0.0.1:12538" + export CUDA_VISIBLE_DEVICES=0,1,2,3 + export FLAGS_stop_check_timeout=3600 + + python -m paddle.distributed.launch \ + --nnodes=$PADDLE_NNODES \ + --master=$PADDLE_MASTER \ + --devices=$CUDA_VISIBLE_DEVICES \ + passl-train \ + -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml \ + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + ``` ## Other Configurations We provide more directly runnable configurations, see [SwAV Configurations](./configs/).
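One detail worth calling out for the fine-tuning command documented above: `Global.pretrained_model` is passed without the `.pdparams` suffix because SwAV's `_load_model` appends it and then copies only the keys whose shapes match, logging the rest. A condensed sketch of that tolerant loading behaviour (illustrative, not the exact PASSL code):

```python
import os
import paddle

def load_pretrained_tolerant(model, prefix):
    """Load `prefix + '.pdparams'` into `model`, skipping missing or
    shape-mismatched keys instead of raising."""
    path = prefix + ".pdparams"
    if not os.path.isfile(path):
        print(f"No pretrained weights found at {path}, training with random weights")
        return
    pretrained = paddle.load(path)
    state = model.state_dict()
    loaded = 0
    for key, value in state.items():
        if key in pretrained and list(pretrained[key].shape) == list(value.shape):
            state[key] = pretrained[key].astype(value.dtype)
            loaded += 1
    model.set_dict(state)
    print(f"Loaded {loaded}/{len(state)} variables from {path}")
```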
diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 7dc624cc..4eb691fe 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: swav_800ep_pretrain_adjustresnet + pretrained_model: swav_800ep_pretrain_adjustresnetn finetune: True - output_dir: ./output/semi_0426_semi10 + output_dir: ./output/semi_0506_semi10 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 68b54a47..08a1dc25 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -28,7 +28,7 @@ Model: type: swavresnet50 output_dim: 0 eval_mode: True - class_num: 1000 + class_num: 1000 # loss function config for traing/eval process Loss: diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 2e5b7a7d..75961e5a 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -13,7 +13,7 @@ Global: eval_interval: 1 eval_unit: "epoch" accum_steps: 1 - epochs: 400 # 800 + epochs: 400 print_batch_step: 100 use_visualdl: False seed: 31 @@ -34,12 +34,12 @@ Model: hidden_mlp: 2048 output_dim: 128 nmb_prototypes: 3000 - apex: False - queue_length: 3804 # 0 + apex: False + queue_length: 3804 crops_for_assign: [0, 1] nmb_crops: [2, 6] epsilon: 0.05 - freeze_prototypes_niters: 5005 # 313 + freeze_prototypes_niters: 5005 Optimizer: name: MomentumLARC From 2c259818d80f170689880ef936af6e72a3d59a8e Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 14:43:38 +0800 Subject: [PATCH 24/46] fix --- tests/CI/case.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 21a1f9b5..192eb715 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -390,10 +390,10 @@ function simsiam_resnet50_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { +function swav_resnet50_224_ft_in1k_1n4c_dp() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` @@ -405,10 +405,10 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { +function swav_resnet50_224_lp_in1k_1n8c_dp() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print 
$1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` From df744d0d1f37668b3beae63ed5ab6b41cdf8e003 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 16:12:55 +0800 Subject: [PATCH 25/46] update --- passl/data/dataset/imagefolder_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index a42c6425..76a5d77c 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -78,7 +78,7 @@ def __init__(self, else: raise NotImplementedError('{} is not implemented'.format(samples)) - logger.info(f'find total {len(classes)} classes and {len(samples)} images.') + print(f'find total {len(classes)} classes and {len(samples)} images.') self.extensions = extensions From 948e9b9f71984d079ede8932ba31fc8e92e4ea4a Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Sat, 6 May 2023 17:49:08 +0800 Subject: [PATCH 26/46] fix --- passl/models/swav.py | 4 ++-- .../swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 2 +- tasks/ssl/swav/linearprobe.sh | 2 +- tasks/ssl/swav/pretrain.sh | 5 +++-- tests/CI/case.sh | 4 ++-- ..._1n4c_dp.sh => swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh} | 0 ..._1n8c_dp.sh => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh} | 0 7 files changed, 9 insertions(+), 8 deletions(-) rename tests/CI/ssl/swav/{swav_resnet50_224_ft_in1k_1n4c_dp.sh => swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh} (100%) rename tests/CI/ssl/swav/{swav_resnet50_224_lp_in1k_1n8c_dp.sh => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh} (100%) diff --git a/passl/models/swav.py b/passl/models/swav.py index 0a423b30..3aa13757 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -72,8 +72,8 @@ def _load_model(self, path, tag): model_state_dict[k] = para_state_dict[k] num_params_loaded += 1 self.set_dict(model_state_dict) - logger.info("There are {}/{} variables loaded into {}.".format( - num_params_loaded, len(model_state_dict), tag)) + logger.info("There are {}/{} variables loaded into {} with {}.".format( + num_params_loaded, len(model_state_dict), tag, path)) else: logger.info("No pretrained weights found in {} => training with random weights".format(tag)) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 4eb691fe..c7353402 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: swav_800ep_pretrain_adjustresnetn + pretrained_model: epoch_73n finetune: True output_dir: ./output/semi_0506_semi10 device: gpu diff --git a/tasks/ssl/swav/linearprobe.sh b/tasks/ssl/swav/linearprobe.sh index 07ced970..ad2845a9 100644 --- a/tasks/ssl/swav/linearprobe.sh +++ b/tasks/ssl/swav/linearprobe.sh @@ -16,7 +16,7 @@ unset PADDLE_TRAINER_ENDPOINTS unset DISTRIBUTED_TRAINER_ENDPOINTS export PADDLE_NNODES=1 export PADDLE_MASTER="127.0.0.1:12538" -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export CUDA_VISIBLE_DEVICES=1,2,3,0,4,5,6,7 python -m paddle.distributed.launch \ --nnodes=$PADDLE_NNODES \ diff --git a/tasks/ssl/swav/pretrain.sh b/tasks/ssl/swav/pretrain.sh index ad460394..1288c5d3 
100644 --- a/tasks/ssl/swav/pretrain.sh +++ b/tasks/ssl/swav/pretrain.sh @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -# unset DISTRIBUTED_TRAINER_ENDPOINTS -export PADDLE_NNODES=4 +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +export PADDLE_NNODES=2 export PADDLE_MASTER="127.0.0.1:12538" export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 192eb715..7d215a78 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -390,7 +390,7 @@ function simsiam_resnet50_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_ft_in1k_1n4c_dp() { +function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh @@ -405,7 +405,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp() { echo "=========== $FUNCNAME run end ===========" } -function swav_resnet50_224_lp_in1k_1n8c_dp() { +function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh similarity index 100% rename from tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh rename to tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh similarity index 100% rename from tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh rename to tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh From 49b7daccd77d9610bc3575eae8f3655d334bed89 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 8 May 2023 10:29:37 +0800 Subject: [PATCH 27/46] fix_ci --- tests/CI/case.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 7d215a78..21a1f9b5 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -393,7 +393,7 @@ function simsiam_resnet50_lp_in1k_1n8c_dp_fp32() { function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp.sh + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` @@ -408,7 +408,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { echo "=========== $FUNCNAME run begin ===========" rm -rf log - bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp.sh + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` From 3cfb191d633d99a3a54db26fdcf083be72a6d48c Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 15 May 2023 11:04:30 +0800 Subject: [PATCH 28/46] edit_accord_comment --- passl/engine/engine.py | 18 +-- 
.../engine/loops/contrastive_learning_loop.py | 9 +- passl/engine/loops/loop.py | 10 +- passl/models/swav.py | 112 +++++++++++++++- passl/models/swav_resnet.py | 124 ------------------ passl/scheduler/__init__.py | 8 +- passl/scheduler/lr_scheduler.py | 6 +- 7 files changed, 132 insertions(+), 155 deletions(-) delete mode 100644 passl/models/swav_resnet.py diff --git a/passl/engine/engine.py b/passl/engine/engine.py index c3277561..c43b199b 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -233,14 +233,14 @@ def worker_init_fn(worker_id): self.lr_decay_unit) # load pretrained model - if self.config["Global"]["pretrained_model"] is not None: - assert isinstance( - self.config["Global"]["pretrained_model"], str - ), "pretrained_model type is not available. Please use `string`." - self.model.load_pretrained( - self.config["Global"]["pretrained_model"], - self.config["Global"]["rank"], - self.config["Global"].get("finetune", False)) + if self.config["Global"]["pretrained_model"] is not None: + assert isinstance( + self.config["Global"]["pretrained_model"], str + ), "pretrained_model type is not available. Please use `string`." + self.model.load_pretrained( + self.config["Global"]["pretrained_model"], + self.config["Global"]["rank"], + self.config["Global"].get("finetune", False)) # for distributed if self.config["Global"]["distributed"]: @@ -356,7 +356,7 @@ def train(self): self.vdl_writer.close() @paddle.no_grad() - def eval(self, epoch_id=0): + def eval(self): assert self.mode in ["train", "eval"] self.model.eval() self.validating = True diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 2cd5c91c..663703cc 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -28,7 +28,7 @@ class ContrastiveLearningTrainingEpochLoop(TrainingEpochLoop): def __init__(self, trainer, epochs, max_train_step=None, val_loop=None): super().__init__(trainer, epochs, max_train_step=max_train_step, val_loop=val_loop) - def forward_backward(self, batch, total_iterations): + def forward_backward(self, batch): # Gradient Merge(GuoxiaWang): Accumulate gradient over multiple # steps to save on memory. 
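The `forward_backward` signature change above drops the explicit `total_iterations` argument; the loop now stores it as `self.total_iterations` and hands it to the model's `after_loss_backward` hook (see the following hunks), which SwAV uses to keep the prototype weights frozen for the first `freeze_prototypes_niters` updates. A minimal sketch of that hook as a standalone function; the wiring here is illustrative, PASSL implements it as a method on `SwAVPretrain`:

```python
def freeze_prototypes_early(model, iteration, freeze_prototypes_niters):
    """During warm-up, drop prototype gradients so only the backbone and
    projection head are updated; afterwards the prototypes train normally."""
    if iteration < freeze_prototypes_niters:
        for name, param in model.named_parameters():
            if "prototypes" in name and param.grad is not None:
                param.clear_grad()
```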
@@ -63,19 +63,18 @@ def forward_backward(self, batch, total_iterations): scaled.backward() try: - self.trainer.model.after_loss_backward(total_iterations) + self.trainer.model.after_loss_backward(self.total_iterations) except AttributeError: logger.warning("Model has no after_loss_backward method, ignored this process") return final_loss_dict - def train_one_step(self, batch, total_iterations): - + def train_one_step(self, batch): # remove label batch = batch[0] # do forward and backward - loss_dict = self.forward_backward(batch, total_iterations) + loss_dict = self.forward_backward(batch) grad_sync(self.trainer.optimizer.param_groups) diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index 959bb386..c14978a5 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -219,7 +219,7 @@ def run(self): self.trainer.train_dataloader.batch_sampler.set_epoch(epoch_id) # for one epoch train - self.train_one_epoch(epoch_id) + self.train_one_epoch() if self.trainer.lr_decay_unit == 'epoch': self.trainer.optimizer.lr_step(self.cur_epoch_id) @@ -257,14 +257,14 @@ def run(self): self.trainer.training = False - def train_one_epoch(self, epoch_id): + def train_one_epoch(self): self.trainer.model.train() tic = time.time() for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - total_iterations = (epoch_id-1)*self.total_batch_idx + batch_idx + self.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( @@ -289,7 +289,7 @@ def train_one_epoch(self, epoch_id): self.global_step += 1 # do forward and backward - out, loss_dict = self.train_one_step(batch, total_iterations) + out, loss_dict = self.train_one_step(batch) self.time_info["batch_cost"].update(time.time() - tic) @@ -311,7 +311,7 @@ def train_one_epoch(self, epoch_id): tic = time.time() - def train_one_step(self, batch, total_iterations): + def train_one_step(self, batch): raise NotImplementedError def save_checkpoint(self): diff --git a/passl/models/swav.py b/passl/models/swav.py index 3aa13757..cb9c7298 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -13,19 +13,17 @@ # limitations under the License. 
import os +import functools import numpy as np from sys import flags -from collections import defaultdict import paddle import paddle.nn as nn from passl.nn import init -from passl.scheduler import build_lr_scheduler from passl.utils import logger -from passl.models.swav_resnet import swavresnet50 from passl.models.base_model import Model - +from passl.models.resnet import ResNet, BottleneckBlock __all__ = [ 'swav_resnet50_finetune', @@ -237,3 +235,109 @@ def forward(self, x): x = self.av_pool(x) x = x.reshape((x.shape[0], -1)) return self.linear(x) + + +def kaiming_normal_init(param, **kwargs): + initializer = nn.initializer.KaimingNormal(**kwargs) + initializer(param, param.block) + +def constant_init(param, **kwargs): + initializer = nn.initializer.Constant(**kwargs) + initializer(param, param.block) + + +class SwAVResNet(paddle.nn.Layer): + def __init__(self, block, depth, + normalize=False, output_dim=0, hidden_mlp=0, + nmb_prototypes=0, eval_mode=False): + + super(SwAVResNet, self).__init__() + self.l2norm = normalize + self.eval_mode = eval_mode + num_out_filters = 512 + + self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) + + if output_dim == 0: + self.projection_head = None + elif hidden_mlp == 0: + self.projection_head = paddle.nn.Linear(in_features= + num_out_filters * block.expansion, out_features=output_dim) + else: + self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( + in_features=num_out_filters * block.expansion, out_features + =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, + momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, + bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), + paddle.nn.Linear(in_features=hidden_mlp, out_features= + output_dim)) + + self.prototypes = None + if isinstance(nmb_prototypes, list): + self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) + elif nmb_prototypes > 0: + self.prototypes = paddle.nn.Linear(in_features=output_dim, + out_features=nmb_prototypes, bias_attr=False) + for sublayer in self.sublayers(): + if isinstance(sublayer, nn.Conv2D): + kaiming_normal_init(sublayer.weight) + elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): + constant_init(sublayer.weight, value=1.0) + constant_init(sublayer.bias, value=0.0) + + self.encoder = functools.partial(ResNet, block=block, depth=depth)(with_pool=False, class_num=0) + + def forward_backbone(self, x): + x = self.encoder(x) + + if self.eval_mode: + return x + + x = self.avgpool(x) + x = paddle.flatten(x=x, start_axis=1) + return x + + def forward_head(self, x): + if self.projection_head is not None: + x = self.projection_head(x) + if self.l2norm: + x = paddle.nn.functional.normalize(x=x, axis=1, p=2) + if self.prototypes is not None: + return x, self.prototypes(x) + return x + + def forward(self, inputs): + if not isinstance(inputs, list): + inputs = [inputs] + + idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. 
+ to_tensor(data=[inp.shape[-1] for inp in inputs]), + return_counts=True)[1], axis=0) # padiff + start_idx = 0 + for end_idx in idx_crops: + _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) + if start_idx == 0: + output = _out + else: + output = paddle.concat(x=(output, _out)) + start_idx = end_idx + return self.forward_head(output) + + +class MultiPrototypes(paddle.nn.Layer): + def __init__(self, output_dim, nmb_prototypes): + super(MultiPrototypes, self).__init__() + self.nmb_heads = len(nmb_prototypes) + for i, k in enumerate(nmb_prototypes): + self.add_module('prototypes' + str(i), paddle.nn.Linear( + in_features=output_dim, out_features=k, bias_attr=False)) + + def forward(self, x): + out = [] + for i in range(self.nmb_heads): + out.append(getattr(self, 'prototypes' + str(i))(x)) + return out + + +def swavresnet50(**kwargs): + return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) diff --git a/passl/models/swav_resnet.py b/passl/models/swav_resnet.py deleted file mode 100644 index 03e5dffb..00000000 --- a/passl/models/swav_resnet.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import functools -import paddle.nn as nn - -from .resnet import ResNet, BottleneckBlock - -def kaiming_normal_init(param, **kwargs): - initializer = nn.initializer.KaimingNormal(**kwargs) - initializer(param, param.block) - -def constant_init(param, **kwargs): - initializer = nn.initializer.Constant(**kwargs) - initializer(param, param.block) - - -class SwAVResNet(paddle.nn.Layer): - def __init__(self, block, depth, - normalize=False, output_dim=0, hidden_mlp=0, - nmb_prototypes=0, eval_mode=False): - - super(SwAVResNet, self).__init__() - self.l2norm = normalize - self.eval_mode = eval_mode - num_out_filters = 512 - - self.avgpool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) - - if output_dim == 0: - self.projection_head = None - elif hidden_mlp == 0: - self.projection_head = paddle.nn.Linear(in_features= - num_out_filters * block.expansion, out_features=output_dim) - else: - self.projection_head = paddle.nn.Sequential(paddle.nn.Linear( - in_features=num_out_filters * block.expansion, out_features - =hidden_mlp), paddle.nn.BatchNorm1D(num_features=hidden_mlp, - momentum=1 - 0.1, epsilon=1e-05, weight_attr=None, - bias_attr=None, use_global_stats=True), paddle.nn.ReLU(), - paddle.nn.Linear(in_features=hidden_mlp, out_features= - output_dim)) - - self.prototypes = None - if isinstance(nmb_prototypes, list): - self.prototypes = MultiPrototypes(output_dim, nmb_prototypes) - elif nmb_prototypes > 0: - self.prototypes = paddle.nn.Linear(in_features=output_dim, - out_features=nmb_prototypes, bias_attr=False) - for sublayer in self.sublayers(): - if isinstance(sublayer, nn.Conv2D): - kaiming_normal_init(sublayer.weight) # todo mode='fan_out', - elif isinstance(sublayer, (nn.BatchNorm2D, nn.GroupNorm)): - constant_init(sublayer.weight, value=1.0) - 
constant_init(sublayer.bias, value=0.0) - - self.encoder = functools.partial(ResNet, block=block, depth=depth)(with_pool=False, class_num=0) - - def forward_backbone(self, x): - x = self.encoder(x) - - if self.eval_mode: - return x - - x = self.avgpool(x) - x = paddle.flatten(x=x, start_axis=1) - return x - - def forward_head(self, x): - if self.projection_head is not None: - x = self.projection_head(x) - if self.l2norm: - x = paddle.nn.functional.normalize(x=x, axis=1, p=2) - if self.prototypes is not None: - return x, self.prototypes(x) - return x - - def forward(self, inputs): - if not isinstance(inputs, list): - inputs = [inputs] - - idx_crops = paddle.cumsum(x=paddle.unique_consecutive(x=paddle. - to_tensor(data=[inp.shape[-1] for inp in inputs]), - return_counts=True)[1], axis=0) # padiff - start_idx = 0 - for end_idx in idx_crops: - _out = self.forward_backbone(paddle.concat(x=inputs[start_idx:end_idx])) - if start_idx == 0: - output = _out - else: - output = paddle.concat(x=(output, _out)) - start_idx = end_idx - return self.forward_head(output) - - -class MultiPrototypes(paddle.nn.Layer): - def __init__(self, output_dim, nmb_prototypes): - super(MultiPrototypes, self).__init__() - self.nmb_heads = len(nmb_prototypes) - for i, k in enumerate(nmb_prototypes): - self.add_module('prototypes' + str(i), paddle.nn.Linear( - in_features=output_dim, out_features=k, bias_attr=False)) - - def forward(self, x): - out = [] - for i in range(self.nmb_heads): - out.append(getattr(self, 'prototypes' + str(i))(x)) - return out - - -def swavresnet50(**kwargs): - return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) diff --git a/passl/scheduler/__init__.py b/passl/scheduler/__init__.py index a538bb70..002ee755 100644 --- a/passl/scheduler/__init__.py +++ b/passl/scheduler/__init__.py @@ -12,11 +12,10 @@ # limitations under the License. 
import paddle -from paddle.optimizer.lr import MultiStepDecay from passl.utils import logger -from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly +from .lr_scheduler import TimmCosine, ViTLRScheduler, Step, Poly, MultiStepDecay from .lr_callable import LRCallable @@ -24,11 +23,6 @@ def build_lr_scheduler(lr_config, epochs, step_each_epoch): lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) if 'name' in lr_config: lr_name = lr_config.pop('name') - if "MultiStepDecay" in lr_name: - lr_config.pop('epochs') - lr_config.pop('step_each_epoch') - lr_config.pop('decay_unit') - print(lr_config) lr = eval(lr_name)(**lr_config) if isinstance(lr, paddle.optimizer.lr.LRScheduler): return lr diff --git a/passl/scheduler/lr_scheduler.py b/passl/scheduler/lr_scheduler.py index 223ca349..2b91405a 100644 --- a/passl/scheduler/lr_scheduler.py +++ b/passl/scheduler/lr_scheduler.py @@ -19,7 +19,6 @@ from paddle.optimizer import lr from passl.utils import logger - class TimmCosine(lr.LRScheduler): def __init__(self, learning_rate, @@ -200,3 +199,8 @@ def get_lr(self): return self.base_lr * pow(1 - float(self.last_epoch - self.warmups) / float(self.T_max - self.warmups), 2) + + +class MultiStepDecay(lr.MultiStepDecay): + def __init__(self, learning_rate, milestones, gamma, last_epoch, **kwargs): + super().__init__(learning_rate, milestones, gamma, last_epoch) From d9f9bf1ad0c87e48e1ba798888dcc9cf88e2ff58 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Mon, 15 May 2023 11:33:42 +0800 Subject: [PATCH 29/46] fix --- passl/models/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/passl/models/__init__.py b/passl/models/__init__.py index 85f9663b..0792faae 100644 --- a/passl/models/__init__.py +++ b/passl/models/__init__.py @@ -27,7 +27,6 @@ from .convnext import * from .mocov3 import * from .swav import * -from .swav_resnet import * from .simsiam import * __all__ = ["build_model"] From dc55ff6490c422d34925f94b468fa870ec58c097 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 16 May 2023 15:25:22 +0800 Subject: [PATCH 30/46] fix_by_comment --- passl/data/dataset/__init__.py | 1 + passl/data/dataset/fewshot_dataset.py | 58 +++++++++++++++++++ passl/data/dataset/imagefolder_dataset.py | 21 +------ passl/engine/engine.py | 2 +- passl/engine/loops/classification_loop.py | 2 +- .../engine/loops/contrastive_learning_loop.py | 5 -- passl/engine/loops/loop.py | 3 +- passl/models/swav.py | 13 ++++- ...wav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 8 +-- tests/CI/case.sh | 4 +- 10 files changed, 81 insertions(+), 36 deletions(-) create mode 100644 passl/data/dataset/fewshot_dataset.py diff --git a/passl/data/dataset/__init__.py b/passl/data/dataset/__init__.py index b19912e1..011cae11 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -64,3 +64,4 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder from .multicrop_dataset import MultiCropDataset +from .fewshot_dataset import FewShotDataset diff --git a/passl/data/dataset/fewshot_dataset.py b/passl/data/dataset/fewshot_dataset.py new file mode 100644 index 00000000..809abd0a --- /dev/null +++ b/passl/data/dataset/fewshot_dataset.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from passl.utils import logger +from passl.data.dataset import default_loader +from passl.data.dataset import ImageFolder + +IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", + ".tiff", ".webp") + + +class FewShotDataset(ImageFolder): + """ + This class inherits from :class:`~passl.data.datasets.ImageFolder`, so + the dataset takes a txt file containing image names to find the data. + Args: + root (string): Root directory path. + transform (callable, optional): A function/transform that takes in a numpy image + and returns a transformed version. E.g., ``transforms.RandomCrop`` + target_transform (callable, optional): A function/transform that takes in the + target and transforms it. + loader (callable, optional): A function to load an image given its path. + txt_file_name (string): Path to the txt file listing the image names to use. + """ + + def __init__(self, + root, + transform=None, + target_transform=None, + loader=default_loader, + extensions=IMG_EXTENSIONS, + txt_file_name=None): + super(FewShotDataset, self).__init__(root=root, transform=transform, + target_transform=target_transform, loader=loader, + extensions=extensions) + + assert txt_file_name is not None, "The txt_file_name must be assigned." + if os.path.isfile(txt_file_name): + with open(txt_file_name, 'r') as f: + list_imgs = [li.split('\n')[0] for li in f.readlines()] + + self.imgs = [(os.path.join(root, li.split('_')[0], li), self.class_to_idx[li.split('_')[0]]) for li in list_imgs] + else: + raise FileNotFoundError('{} is not existed'.format(txt_file_name)) + print('Previous information is not correct.') + print(f'Actually, we have total {len(self.imgs)} images in semi-training setting.') diff --git a/passl/data/dataset/imagefolder_dataset.py b/passl/data/dataset/imagefolder_dataset.py index 76a5d77c..fd1429e8 100644 --- a/passl/data/dataset/imagefolder_dataset.py +++ b/passl/data/dataset/imagefolder_dataset.py @@ -13,14 +13,10 @@ # limitations under the License.
import os -import urllib -import urllib.request import numpy as np from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union import paddle - -from passl.utils import logger from passl.data.dataset import default_loader IMG_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", @@ -59,24 +55,11 @@ def __init__(self, transform=None, target_transform=None, loader=default_loader, - extensions=IMG_EXTENSIONS, - samples_tag=None): + extensions=IMG_EXTENSIONS): self.root = root classes, class_to_idx = self.find_classes(self.root) - if samples_tag is None: - samples = self.make_dataset(self.root, class_to_idx, extensions) - elif samples_tag == "semi_1" or samples_tag == "semi_10": - # connection reset proxyon - # subset_file = urllib.request.urlopen("https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/" + str(samples_tag.split('_')[-1]) + "percent.txt") - # list_imgs = [li.decode("utf-8").split('\n')[0] for li in subset_file] - subset_file = str(samples_tag.split('_')[-1]) + "percent.txt" - with open(subset_file, 'r') as f: - list_imgs = [li.split('\n')[0] for li in f.readlines()] - - samples = [(os.path.join(root, li.split('_')[0], li), class_to_idx[li.split('_')[0]]) for li in list_imgs] - else: - raise NotImplementedError('{} is not implemented'.format(samples)) + samples = self.make_dataset(self.root, class_to_idx, extensions) print(f'find total {len(classes)} classes and {len(samples)} images.') diff --git a/passl/engine/engine.py b/passl/engine/engine.py index c43b199b..d0008aa7 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -233,7 +233,7 @@ def worker_init_fn(worker_id): self.lr_decay_unit) # load pretrained model - if self.config["Global"]["pretrained_model"] is not None: + if self.config["Global"]["pretrained_model"] is not None: assert isinstance( self.config["Global"]["pretrained_model"], str ), "pretrained_model type is not available. Please use `string`." 
diff --git a/passl/engine/loops/classification_loop.py b/passl/engine/loops/classification_loop.py index 659bcc19..92ce83f0 100644 --- a/passl/engine/loops/classification_loop.py +++ b/passl/engine/loops/classification_loop.py @@ -77,7 +77,7 @@ def forward_backward(self, batch): out = paddle.concat(final_out, axis=0) return out, final_loss_dict - def train_one_step(self, batch, total_iterations=None): + def train_one_step(self, batch): # do forward and backward out, loss_dict = self.forward_backward(batch) diff --git a/passl/engine/loops/contrastive_learning_loop.py b/passl/engine/loops/contrastive_learning_loop.py index 663703cc..428d4853 100644 --- a/passl/engine/loops/contrastive_learning_loop.py +++ b/passl/engine/loops/contrastive_learning_loop.py @@ -62,11 +62,6 @@ def forward_backward(self, batch): scaled = self.trainer.scaler.scale(loss_dict["loss"]) scaled.backward() - try: - self.trainer.model.after_loss_backward(self.total_iterations) - except AttributeError: - logger.warning("Model has no after_loss_backward method, ignored this process") - return final_loss_dict def train_one_step(self, batch): diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index c14978a5..80b643df 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -26,6 +26,7 @@ from passl.utils import io from passl.utils import logger from passl.utils.misc import SmoothedValue +from passl.utils.infohub import runtime_info_hub class _Loop: """Basic Loops interface.""" @@ -264,7 +265,7 @@ def train_one_epoch(self): for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - self.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx + runtime_info_hub.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( diff --git a/passl/models/swav.py b/passl/models/swav.py index cb9c7298..08ea2879 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -22,6 +22,7 @@ from passl.nn import init from passl.utils import logger +from passl.utils.infohub import runtime_info_hub from passl.models.base_model import Model from passl.models.resnet import ResNet, BottleneckBlock @@ -135,9 +136,6 @@ def __init__(self, queue_length=0, crops_for_assign=(0, 1), nmb_crops=[2, 6], ep self.apply(self._freeze_norm) - # def load_pretrained(self, path, rank=0, finetune=False): - # self._load_model('swav_800ep_pretrain.pdparams', 'backbone') - @paddle.no_grad() def distributed_sinkhorn(self, out, sinkhorn_iterations=3): Q = paddle.exp(x=out / self.epsilon).t() @@ -159,6 +157,15 @@ def distributed_sinkhorn(self, out, sinkhorn_iterations=3): def forward(self, inp): bs = inp[0].shape[0] + if runtime_info_hub.total_iterations < self.freeze_prototypes_niters: + for name, p in self.res_model.named_parameters(): + if 'prototypes' in name: + p.stop_gradient = True + else: + for name, p in self.res_model.named_parameters(): + if 'prototypes' in name: + p.stop_gradient = False + # normalize the prototypes with paddle.no_grad(): w = self.res_model.prototypes.weight.clone() diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index c7353402..0427f294 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,9 +4,9 @@ Global: train_loop: 
ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: epoch_73n + pretrained_model: epoch_794 finetune: True - output_dir: ./output/semi_0506_semi10 + output_dir: ./output/semi_0515_semi10 device: gpu save_interval: 1 max_num_latest_checkpoint: 0 @@ -63,7 +63,7 @@ Optimizer: DataLoader: Train: dataset: - name: ImageFolder + name: FewShotDataset root: data/ILSVRC2012/train transform: - RandomResizedCrop: @@ -73,7 +73,7 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - samples_tag: semi_10 + txt_file_name: 10percent.txt sampler: name: DistributedBatchSampler batch_size: 128 # accum_steps: 1, total batchsize: 256 diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 21a1f9b5..1bcb646e 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -398,9 +398,9 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=1.95351 + loss_base=1.97248 ips_base=793.89847 - mem_base=5.67 + mem_base=10.74 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } From 6b1c6a5efbb4a6afd64063a630f97967b233295e Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Tue, 16 May 2023 15:47:30 +0800 Subject: [PATCH 31/46] pretrained_model --- passl/data/dataset/fewshot_dataset.py | 2 +- passl/utils/io.py | 5 ----- tasks/ssl/swav/README.md | 17 +++++++++++++++-- .../swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 ++-- .../swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 2 +- 5 files changed, 19 insertions(+), 11 deletions(-) diff --git a/passl/data/dataset/fewshot_dataset.py b/passl/data/dataset/fewshot_dataset.py index 809abd0a..c696f549 100644 --- a/passl/data/dataset/fewshot_dataset.py +++ b/passl/data/dataset/fewshot_dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/passl/utils/io.py b/passl/utils/io.py index ab166eaf..cc7b5a28 100644 --- a/passl/utils/io.py +++ b/passl/utils/io.py @@ -158,11 +158,6 @@ def save_checkpoint(net, if loss_scaler is not None: opt_state_dict['scaler_state'] = loss_scaler.state_dict() - # Solve AttrDict can't pickle error - for group in opt_state_dict['param_groups']: - if 'LRScheduler' in group: - group['LRScheduler'] = dict(group['LRScheduler']) - for model_prefix in model_prefixs: paddle.save(opt_state_dict, model_prefix + ".pdopt") paddle.save(metric_info, model_prefix + ".pdstates") diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 4cd19543..2d44d7c5 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -40,9 +40,16 @@ python -m paddle.distributed.launch \ ``` ## How to Linear Classification - By default, we use momentum-SGD and a batch size of 256 for linear classification on frozen features/weights. This can be done with a single 8-GPU node. 
+- Download pretrained model +```bash +mkdir -p pretrained/swav +wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams +``` + +- Train linear classification model + ```bash unset PADDLE_TRAINER_ENDPOINTS export PADDLE_NNODES=1 @@ -70,7 +77,13 @@ To perform end-to-end fine-tuning for SwAV: wget "https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/1percent.txt" ``` -* Then, download the pretrained models to `./pretrained/swav/swav_resnet50_in1k_800ep_pretrained.pdparams` +* Then, download the pretrained models to `./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams` + +- Download pretrained model +```bash +mkdir -p pretrained/swav +wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams +``` * Finally, run the training with the trained PASSL format checkpoint: ```bash diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 0427f294..e8715e9e 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -4,9 +4,9 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: epoch_794 + pretrained_model: ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained finetune: True - output_dir: ./output/semi_0515_semi10 + output_dir: ./output device: gpu save_interval: 1 max_num_latest_checkpoint: 0 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 08a1dc25..05c9b334 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -4,7 +4,7 @@ Global: train_loop: ClassificationTrainingEpochLoop validate_loop: ClassificationEvaluationLoop checkpoint: null - pretrained_model: ./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + pretrained_model: ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained output_dir: ./output device: gpu save_interval: 1 From bc3f41f334ea3bf9d327e8ccc444cab82f97b4eb Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 10:07:08 +0800 Subject: [PATCH 32/46] add_models --- tasks/ssl/swav/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 2d44d7c5..def91e8e 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -108,12 +108,12 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co ## Models ### ViT-Base -| Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links | +| Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links | | ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ | -| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model]() \| [log]() | -| resnet50 | linear probe | ImageNet2012 | 
[config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 75.3 | 0.7662 | [model]() \| [log]() | -| resnet50 | finetune | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 100 | 69.0 | [model]() \| [log]() | - +| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://github.com/shiyutang/files/files/11493437/pretrain_train.log) | +| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://github.com/shiyutang/files/files/11493435/linear_train.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493438/semi10_train.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493451/semi1.log) | ## Citations ```bibtex From c8175d31e225e83baaffd8b82260ee9d0948dc3a Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 10:15:53 +0800 Subject: [PATCH 33/46] CI --- tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 6 +++--- tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh | 6 +++--- .../CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index f56f1e0b..734e5108 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -22,9 +22,9 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml \ - -o Global.print_batch_step=1 \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp32.yaml \ + -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh index fd8a7709..d018a166 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh @@ -22,9 +22,9 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml \ - -o Global.print_batch_step=1 \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml \ 
+ -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 \ - -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained + -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained diff --git a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh index 2fa7ad20..8d8cd867 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh @@ -23,7 +23,7 @@ python -m paddle.distributed.launch \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ -c ../../tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml \ - -o Global.print_batch_step=1 \ + -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 From 510ca3d5a8f0c6d9a76cc28153410e0be8670963 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 10:18:53 +0800 Subject: [PATCH 34/46] CI --- tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index 734e5108..59cfe413 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -26,5 +26,5 @@ python -m paddle.distributed.launch \ -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ - -o Global.flags.FLAGS_cudnn_deterministic=1 + -o Global.flags.FLAGS_cudnn_deterministic=1 \ -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained From 455bf84d0be2652d41422d0948ceafe30ca7ca12 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 14:13:43 +0800 Subject: [PATCH 35/46] CI --- tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index 59cfe413..220262ac 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -22,7 +22,7 @@ python -m paddle.distributed.launch \ --master=$PADDLE_MASTER \ --devices=$CUDA_VISIBLE_DEVICES \ passl-train \ - -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n8c_dp_fp32.yaml \ + -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \ -o Global.print_batch_step=20 \ -o Global.max_train_step=201 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ From 9473b92348d4c55d8de77ac2a55aef22cf9426e7 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 17:00:24 +0800 Subject: [PATCH 36/46] fix_CI --- .../swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 4 ++-- .../swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml | 4 ++-- .../configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 2 +- .../configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml 
b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index e8715e9e..32fa7a90 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -64,7 +64,7 @@ DataLoader: Train: dataset: name: FewShotDataset - root: data/ILSVRC2012/train + root: ./dataset/ILSVRC2012/train transform: - RandomResizedCrop: size: 224 @@ -86,7 +86,7 @@ DataLoader: Eval: dataset: name: ImageFolder - root: data/ILSVRC2012/val + root: ./dataset/ILSVRC2012/val transform: - Resize: size: 256 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml index 05c9b334..59c44277 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml @@ -57,7 +57,7 @@ DataLoader: Train: dataset: name: ImageFolder - root: data/ILSVRC2012/train + root: ./dataset/ILSVRC2012/train transform: - RandomResizedCrop: size: 224 @@ -78,7 +78,7 @@ DataLoader: Eval: dataset: name: ImageFolder - root: data/ILSVRC2012/val + root: ./dataset/ILSVRC2012/val transform: - Resize: size: 256 diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index 75961e5a..f2226887 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -63,7 +63,7 @@ DataLoader: Train: dataset: name: MultiCropDataset - root: ./data/ILSVRC2012 + root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index cdc0ebf8..96042f86 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -62,7 +62,7 @@ DataLoader: Train: dataset: name: MultiCropDataset - root: ./data/ILSVRC2012 + root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] min_scale_crops: [0.14, 0.05] From 42016993b8f7528220c7fed1a3d83d6fad3cda64 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 20:07:58 +0800 Subject: [PATCH 37/46] update_10per --- .../swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml index 32fa7a90..283d4e8f 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml @@ -73,7 +73,7 @@ DataLoader: - Normalize: mean: [0.485, 0.456, 0.406] std: [0.228, 0.224, 0.225] - txt_file_name: 10percent.txt + txt_file_name: ./dataset/ILSVRC2012/10percent.txt sampler: name: DistributedBatchSampler batch_size: 128 # accum_steps: 1, total batchsize: 256 From 0d4292ab99a3b35107c22e94e94280f0470f4e0d Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Wed, 17 May 2023 21:32:47 +0800 Subject: [PATCH 38/46] fix_ci --- tasks/ssl/swav/README.md | 123 ++++++++++++++++++ tests/CI/case.sh | 38 ++++-- .../swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh | 2 +- 3 files changed, 148 insertions(+), 15 deletions(-) 
diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index def91e8e..0950a663 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -126,3 +126,126 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co primaryClass={cs.CV} } ``` + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env bash +set -e + +export passl_path=/paddle/PASSL/tests/CI +export log_path=/paddle/log_passl + +function model_list(){ + swav_resnet50_224_ft_in1k_1n4c_dp_fp32 + swav_resnet50_224_lp_in1k_1n8c_dp_fp32 + swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 +} + +############ case start ############ + +function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '120/126' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=2.01301 + ips_base=1922.62626 + mem_base=10.50 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + +function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh + + loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=4.89133 + ips_base=11111.52955 + mem_base=0.83 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + + +function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { + echo "=========== $FUNCNAME run begin ===========" + rm -rf log + bash ./ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh + + loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` + mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=8.00343 + ips_base=1385.94186 + mem_base=8.63 + check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} + echo "=========== $FUNCNAME run end ===========" +} + +function check_result() { + if [ $? 
-ne 0 ];then + echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log + exit -1 + fi + + if [ $# -ne 7 ]; then + echo -e "\033 parameter transfer failed: $@ \033" | tee -a $log_path/result.log + exit -1 + fi + + echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log + if [ $2 != $3 ];then + echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + + diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') + echo -e "ips_base: $4 ips_test: $5 ips_diff: $diff% " | tee -a $log_path/result.log + # 设置不同ips校验阈值 + if [ $1 == mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1 ];then + v1=$(echo $diff 10.0|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -10.0|awk '{print($1<=$2)?"0":"1"}') + else + v1=$(echo $diff 5.0|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -5.0|awk '{print($1<=$2)?"0":"1"}') + fi + if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then + echo -e "\033 $1 ips diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + + echo -e "mem_base: $6 mem_test: $7" | tee -a $log_path/result.log + if [ $6 != $7 ];then + echo -e "\033 $1 mem diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + +} + + +main() { + cd ${passl_path} + + model_list +} + +main$@ diff --git a/tests/CI/case.sh b/tests/CI/case.sh index d123f64e..caeca3cb 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -395,12 +395,12 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { rm -rf log bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh - loss=`cat log/workerlog.0 | grep '200/501' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` + loss=`cat log/workerlog.0 | grep '120/126' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '200/501' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=1.97248 - ips_base=793.89847 - mem_base=10.74 + mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` + loss_base=2.01301 + ips_base=1536.33 + mem_base=10.50 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } @@ -413,9 +413,9 @@ function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=4.89133 - ips_base=11111.52955 - mem_base=0.83 + loss_base=3.83529 + ips_base=5620.26 + mem_base=0.46 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } @@ -429,9 +429,9 @@ function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=8.00343 - ips_base=1385.94186 - mem_base=8.63 + 
loss_base=7.94478 + ips_base=982.07 + mem_base=8.62 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } @@ -448,9 +448,19 @@ function check_result() { fi echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log - if [ $2 != $3 ];then - echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log - exit -1 + diff=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') + if [ $1 == swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 ];then + v1=$(echo $diff 0.1|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -0.1|awk '{print($1<=$2)?"0":"1"}') + if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then + echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi + else + if [ $2 != $3 ];then + echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log + exit -1 + fi fi diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') diff --git a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh index 220262ac..badf168a 100644 --- a/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh +++ b/tests/CI/ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh @@ -24,7 +24,7 @@ python -m paddle.distributed.launch \ passl-train \ -c ../../tasks/ssl/swav/configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \ -o Global.print_batch_step=20 \ - -o Global.max_train_step=201 \ + -o Global.max_train_step=121 \ -o Global.flags.FLAGS_cudnn_exhaustive_search=0 \ -o Global.flags.FLAGS_cudnn_deterministic=1 \ -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained From 3b0862c6af6b885251e242f958cf18de12ea9b6d Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 09:58:33 +0800 Subject: [PATCH 39/46] ft_ips --- tests/CI/case.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index caeca3cb..f6e34761 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -399,7 +399,7 @@ function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` loss_base=2.01301 - ips_base=1536.33 + ips_base=1919.8 mem_base=10.50 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" From afba8ea57727856e04ff86686882fbfc041312fc Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 11:03:35 +0800 Subject: [PATCH 40/46] fix_by_comment --- passl/data/dataset/__init__.py | 2 +- passl/data/dataset/fewshot_dataset.py | 4 +- ...p_dataset.py => swavmulticrop_datatset.py} | 0 passl/engine/engine.py | 1 + passl/engine/loops/loop.py | 1 - passl/models/swav.py | 6 +- tasks/ssl/swav/README.md | 145 ++---------------- ...av_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml} | 0 ...v_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml | 2 +- ..._resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml | 2 +- 10 files changed, 19 insertions(+), 144 deletions(-) rename passl/data/dataset/{multicrop_dataset.py => swavmulticrop_datatset.py} (100%) rename tasks/ssl/swav/configs/{swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml => swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml} (100%) diff --git a/passl/data/dataset/__init__.py 
b/passl/data/dataset/__init__.py index 011cae11..fb5933f5 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -63,5 +63,5 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder -from .multicrop_dataset import MultiCropDataset +from .multicrop_dataset import SwAVMultiCropDataset from .fewshot_dataset import FewShotDataset diff --git a/passl/data/dataset/fewshot_dataset.py b/passl/data/dataset/fewshot_dataset.py index c696f549..47448308 100644 --- a/passl/data/dataset/fewshot_dataset.py +++ b/passl/data/dataset/fewshot_dataset.py @@ -54,5 +54,5 @@ def __init__(self, self.imgs = [(os.path.join(root, li.split('_')[0], li), self.class_to_idx[li.split('_')[0]]) for li in list_imgs] else: raise FileNotFoundError('{} is not existed'.format(txt_file_name)) - print('Previous information is not correct.') - print(f'Actually, we have total {len(self.imgs)} images in semi-training setting.') + logger.info('Previous information is not correct.') + logger.info(f'Actually, we have total {len(self.imgs)} images in semi-training setting.') diff --git a/passl/data/dataset/multicrop_dataset.py b/passl/data/dataset/swavmulticrop_datatset.py similarity index 100% rename from passl/data/dataset/multicrop_dataset.py rename to passl/data/dataset/swavmulticrop_datatset.py diff --git a/passl/engine/engine.py b/passl/engine/engine.py index d0008aa7..5302efff 100644 --- a/passl/engine/engine.py +++ b/passl/engine/engine.py @@ -345,6 +345,7 @@ def checkpoint(self): def init_runtime_info_hub(self): runtime_info_hub.epochs = self.train_loop.epochs runtime_info_hub.max_steps = self.train_loop.max_steps + runtime_info_hub.total_iterations = self.train_loop.global_step def train(self): assert self.mode == "train" diff --git a/passl/engine/loops/loop.py b/passl/engine/loops/loop.py index 80b643df..45745484 100644 --- a/passl/engine/loops/loop.py +++ b/passl/engine/loops/loop.py @@ -265,7 +265,6 @@ def train_one_epoch(self): for batch_idx, batch in enumerate(self.trainer.train_dataloader): self.cur_batch_idx = batch_idx - runtime_info_hub.total_iterations = (self.trainer.cur_epoch_id-1)*self.total_batch_idx + batch_idx if self.max_train_step is not None and self.global_step >= self.max_train_step: logger.info( diff --git a/passl/models/swav.py b/passl/models/swav.py index 08ea2879..db4eeeb8 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -89,7 +89,7 @@ def _freeze_norm(self, layer): class SwAVLinearProbe(SwAV): def __init__(self, class_num=1000, **kwargs): super().__init__(**kwargs) - self.linear = RegLog(class_num) + self.linear = RegLogit(class_num) self.res_model.eval() # freeze all layers but the last fc @@ -226,11 +226,11 @@ def swav_resnet50_pretrain(apex, **kwargs): # todo return model -class RegLog(paddle.nn.Layer): +class RegLogit(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" def __init__(self, num_labels): - super(RegLog, self).__init__() + super(RegLogit, self).__init__() s = 2048 self.av_pool = paddle.nn.AdaptiveAvgPool2D(output_size=(1, 1)) self.linear = paddle.nn.Linear(in_features=s, out_features=num_labels) diff --git a/tasks/ssl/swav/README.md b/tasks/ssl/swav/README.md index 0950a663..e573b84c 100644 --- a/tasks/ssl/swav/README.md +++ b/tasks/ssl/swav/README.md @@ -70,7 +70,7 @@ To perform end-to-end fine-tuning for SwAV: * First download the data split text file with following commands: ```bash - cd PASSL + cd PASSL/dataset/ILSVRC2012 wget 
"https://raw.githubusercontent.com/google-research/simclr/master/imagenet_subsets/10percent.txt" @@ -78,12 +78,10 @@ To perform end-to-end fine-tuning for SwAV: ``` * Then, download the pretrained models to `./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams` - -- Download pretrained model -```bash -mkdir -p pretrained/swav -wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams -``` + ```bash + mkdir -p pretrained/swav + wget -O ./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams + ``` * Finally, run the training with the trained PASSL format checkpoint: ```bash @@ -107,13 +105,13 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co ## Models -### ViT-Base +### Resnet | Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links | | ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ | -| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N4C32 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://github.com/shiyutang/files/files/11493437/pretrain_train.log) | -| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://github.com/shiyutang/files/files/11493435/linear_train.log) | -| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493438/semi10_train.log) | -| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://github.com/shiyutang/files/files/11493451/semi1.log) | +| resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N2C16 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.log) | +| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.log) | +| resnet50 | finetune-semi10 | ImageNet2012 | 
[config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.log) | ## Citations ```bibtex @@ -126,126 +124,3 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co primaryClass={cs.CV} } ``` - -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#!/usr/bin/env bash -set -e - -export passl_path=/paddle/PASSL/tests/CI -export log_path=/paddle/log_passl - -function model_list(){ - swav_resnet50_224_ft_in1k_1n4c_dp_fp32 - swav_resnet50_224_lp_in1k_1n8c_dp_fp32 - swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 -} - -############ case start ############ - -function swav_resnet50_224_ft_in1k_1n4c_dp_fp32() { - echo "=========== $FUNCNAME run begin ===========" - rm -rf log - bash ./ssl/swav/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.sh - - loss=`cat log/workerlog.0 | grep '120/126' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` - ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '120/126' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=2.01301 - ips_base=1922.62626 - mem_base=10.50 - check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} - echo "=========== $FUNCNAME run end ===========" -} - -function swav_resnet50_224_lp_in1k_1n8c_dp_fp32() { - echo "=========== $FUNCNAME run begin ===========" - rm -rf log - bash ./ssl/swav/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.sh - - loss=`cat log/workerlog.0 | grep '200/5005' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` - ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '200/5005' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=4.89133 - ips_base=11111.52955 - mem_base=0.83 - check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} - echo "=========== $FUNCNAME run end ===========" -} - - -function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { - echo "=========== $FUNCNAME run begin ===========" - rm -rf log - bash ./ssl/swav/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.sh - - loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` - ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` - mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=8.00343 - ips_base=1385.94186 - mem_base=8.63 - check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} 
${mem_base} ${mem} - echo "=========== $FUNCNAME run end ===========" -} - -function check_result() { - if [ $? -ne 0 ];then - echo -e "\033 $1 model runs failed! \033" | tee -a $log_path/result.log - exit -1 - fi - - if [ $# -ne 7 ]; then - echo -e "\033 parameter transfer failed: $@ \033" | tee -a $log_path/result.log - exit -1 - fi - - echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log - if [ $2 != $3 ];then - echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi - - diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') - echo -e "ips_base: $4 ips_test: $5 ips_diff: $diff% " | tee -a $log_path/result.log - # 设置不同ips校验阈值 - if [ $1 == mae_vit_base_patch16_lp_in1k_1n8c_dp_fp16o1 ];then - v1=$(echo $diff 10.0|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -10.0|awk '{print($1<=$2)?"0":"1"}') - else - v1=$(echo $diff 5.0|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -5.0|awk '{print($1<=$2)?"0":"1"}') - fi - if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then - echo -e "\033 $1 ips diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi - - echo -e "mem_base: $6 mem_test: $7" | tee -a $log_path/result.log - if [ $6 != $7 ];then - echo -e "\033 $1 mem diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi - -} - - -main() { - cd ${passl_path} - - model_list -} - -main$@ diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml b/tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml similarity index 100% rename from tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yml rename to tasks/ssl/swav/configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml index f2226887..f3f3f3ab 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1.yaml @@ -62,7 +62,7 @@ Optimizer: DataLoader: Train: dataset: - name: MultiCropDataset + name: SwAVMultiCropDataset root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] diff --git a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml index 96042f86..bf59988b 100644 --- a/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml +++ b/tasks/ssl/swav/configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml @@ -61,7 +61,7 @@ Optimizer: DataLoader: Train: dataset: - name: MultiCropDataset + name: SwAVMultiCropDataset root: ./dataset/ILSVRC2012 size_crops: [224, 96] num_crops: [2, 6] From c975a33a0dc45f527a4dec676bd9e440a1fa10fd Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 14:21:58 +0800 Subject: [PATCH 41/46] update --- passl/data/dataset/__init__.py | 2 +- passl/data/dataset/swavmulticrop_datatset.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/passl/data/dataset/__init__.py b/passl/data/dataset/__init__.py index fb5933f5..2f75f2b6 100644 --- a/passl/data/dataset/__init__.py +++ b/passl/data/dataset/__init__.py @@ -63,5 +63,5 @@ def default_loader(path: str): from .imagenet_dataset import ImageNetDataset from .imagefolder_dataset import ImageFolder -from .multicrop_dataset import SwAVMultiCropDataset +from .swavmulticrop_datatset import SwAVMultiCropDataset from .fewshot_dataset import FewShotDataset diff --git 
a/passl/data/dataset/swavmulticrop_datatset.py b/passl/data/dataset/swavmulticrop_datatset.py index f3acce64..ce7826ba 100644 --- a/passl/data/dataset/swavmulticrop_datatset.py +++ b/passl/data/dataset/swavmulticrop_datatset.py @@ -29,14 +29,14 @@ ) -class MultiCropDataset(ImageFolder): +class SwAVMultiCropDataset(ImageFolder): def __init__(self, root, size_crops, num_crops, min_scale_crops, max_scale_crops): - super(MultiCropDataset, self).__init__(root) + super(SwAVMultiCropDataset, self).__init__(root) assert len(size_crops) == len(num_crops) assert len(min_scale_crops) == len(num_crops) From 0016737988f85d870e3597ab57807623dd6e6454 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Thu, 18 May 2023 15:34:00 +0800 Subject: [PATCH 42/46] update --- tests/CI/case.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index f6e34761..c79647ab 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -450,8 +450,8 @@ function check_result() { echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log diff=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') if [ $1 == swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 ];then - v1=$(echo $diff 0.1|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -0.1|awk '{print($1<=$2)?"0":"1"}') + v1=$(echo $diff 0.2|awk '{print($1>=$2)?"0":"1"}') + v2=$(echo $diff -0.2|awk '{print($1<=$2)?"0":"1"}') if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log exit -1 From 363b4a4ac70878fc5623a2097a0980af39f18933 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 19 May 2023 10:29:56 +0800 Subject: [PATCH 43/46] pretrain_fix --- passl/models/swav.py | 50 +++++++++++++++++++++----------------------- tests/CI/case.sh | 13 ++---------- 2 files changed, 26 insertions(+), 37 deletions(-) diff --git a/passl/models/swav.py b/passl/models/swav.py index db4eeeb8..3ba2dda9 100644 --- a/passl/models/swav.py +++ b/passl/models/swav.py @@ -198,33 +198,7 @@ def after_loss_backward(self, iteration): if 'prototypes' in name and p.grad is not None: p.clear_grad() -def swav_resnet50_linearprobe(**kwargs): - model = SwAVLinearProbe(**kwargs) - return model - -def swav_resnet50_finetune(**kwargs): - model = SwAVFinetune(**kwargs) - if paddle.distributed.get_world_size() > 1: - model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - return model - -def swav_resnet50_pretrain(apex, **kwargs): # todo - flags = {} - flags['FLAGS_cudnn_exhaustive_search'] = True - flags['FLAGS_cudnn_deterministic'] = False - paddle.set_flags(flags) - - model = SwAVPretrain(**kwargs) - if paddle.distributed.get_world_size() > 1: - if not apex: - model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - else: - # with apex syncbn speeds up computation than global syncbn - process_group = apex.parallel.create_syncbn_process_group(8) - model = apex.parallel.convert_syncbn_model(model, process_group=process_group) - - return model class RegLogit(paddle.nn.Layer): """Creates logistic regression on top of frozen features""" @@ -348,3 +322,27 @@ def forward(self, x): def swavresnet50(**kwargs): return SwAVResNet(block=BottleneckBlock, depth=50, **kwargs) + + +def swav_resnet50_linearprobe(**kwargs): + model = SwAVLinearProbe(**kwargs) + return model + +def swav_resnet50_finetune(**kwargs): + model = SwAVFinetune(**kwargs) + if paddle.distributed.get_world_size() > 1: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + return model + +def 
swav_resnet50_pretrain(apex, **kwargs): + model = SwAVPretrain(**kwargs) + + if paddle.distributed.get_world_size() > 1: + if not apex: + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + else: + # with apex syncbn speeds up computation than global syncbn + process_group = apex.parallel.create_syncbn_process_group(8) + model = apex.parallel.convert_syncbn_model(model, process_group=process_group) + + return model diff --git a/tests/CI/case.sh b/tests/CI/case.sh index c79647ab..7ee30014 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -448,21 +448,12 @@ function check_result() { fi echo -e "loss_base: $2 loss_test: $3" | tee -a $log_path/result.log - diff=$(echo $2 $3|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') - if [ $1 == swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1 ];then - v1=$(echo $diff 0.2|awk '{print($1>=$2)?"0":"1"}') - v2=$(echo $diff -0.2|awk '{print($1<=$2)?"0":"1"}') - if [[ $v1 == 0 ]] || [[ $v2 == 0 ]];then + if [ $2 != $3 ];then echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log exit -1 - fi - else - if [ $2 != $3 ];then - echo -e "\033 $1 loss diff check failed! \033" | tee -a $log_path/result.log - exit -1 - fi fi + diff=$(echo $4 $5|awk '{printf "%0.2f\n", ($2-$1)/$1*100}') echo -e "ips_base: $4 ips_test: $5 ips_diff: $diff% " | tee -a $log_path/result.log # 设置不同ips校验阈值 From a55609994fb722f5ac41805c4d3720359215751f Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 19 May 2023 11:55:44 +0800 Subject: [PATCH 44/46] update --- tests/CI/case.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 7ee30014..5a05ac78 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -429,7 +429,7 @@ function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { loss=`cat log/workerlog.0 | grep '200/2599' | awk -F 'loss: ' '{print $2}' | awk -F ',' '{print $1}'` ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` - loss_base=7.94478 + loss_base=7.93896 ips_base=982.07 mem_base=8.62 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} From 790500f21c5fd566f4e62d53ea27b48d10c668c8 Mon Sep 17 00:00:00 2001 From: shiyutang <1574572981@qq.com> Date: Fri, 19 May 2023 14:02:14 +0800 Subject: [PATCH 45/46] update --- tests/CI/case.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/CI/case.sh b/tests/CI/case.sh index 5a05ac78..3d10679a 100644 --- a/tests/CI/case.sh +++ b/tests/CI/case.sh @@ -430,8 +430,8 @@ function swav_resnet50_224_pt_in1k_1n8c_dp_fp16o1() { ips=`cat log/workerlog.0 | grep 'ips: ' | awk -F 'ips: ' '{print $2}' | awk -F ' images/sec,' '{print $1}'| awk 'NR>1 {print}' | awk '{a+=$1}END{print a/NR}'` mem=`cat log/workerlog.0 | grep '200/2599' | awk -F 'max mem: ' '{print $2}' | awk -F ' GB,' '{print $1}'` loss_base=7.93896 - ips_base=982.07 - mem_base=8.62 + ips_base=1000.3 + mem_base=8.37 check_result $FUNCNAME ${loss_base} ${loss} ${ips_base} ${ips} ${mem_base} ${mem} echo "=========== $FUNCNAME run end ===========" } From b929bf26e2fcc6ca5ea15efae70771d75df397cc Mon Sep 17 00:00:00 2001 From: tangshiyu Date: Mon, 29 May 2023 20:24:33 +0800 Subject: [PATCH 46/46] update_doc --- tasks/ssl/swav/README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tasks/ssl/swav/README.md 
b/tasks/ssl/swav/README.md
index e573b84c..49dd6719 100644
--- a/tasks/ssl/swav/README.md
+++ b/tasks/ssl/swav/README.md
@@ -62,7 +62,7 @@ python -m paddle.distributed.launch \
     --master=$PADDLE_MASTER \
     --devices=$CUDA_VISIBLE_DEVICES \
     passl-train \
-    -c ./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp16o1.yaml
+    -c ./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml
 ```
 
 ## How to End-to-End Fine-tuning
@@ -96,8 +96,8 @@ To perform end-to-end fine-tuning for SwAV:
     --master=$PADDLE_MASTER \
     --devices=$CUDA_VISIBLE_DEVICES \
     passl-train \
-    -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml
-    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_pretrained
+    -c ./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml \
+    -o Global.pretrained_model=./pretrained/swav/swav_resnet50_in1k_800ep_bz4096_pretrained
 ```
 
 ## Other Configurations
@@ -109,9 +109,9 @@ We provide more directly runnable configurations, see [SwAV Configurations](./co
 | Model | Phase | Dataset | Configs | GPUs | Epochs | Top1 Acc (%) | Links |
 | ------------- | ----------- | ------------ | ------------------------------------------------------------ | ---------- | ------ | -------- | ------------------------------------------------------------ |
 | resnet50 | pretrain | ImageNet2012 | [config](./configs/swav_resnet50_224_pt_in1k_4n32c_dp_fp16o1.yaml) | A100*N2C16 | 800 | - | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_800ep_bz4096_pretrained.log) |
-| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_4n32c_dp_fp16o1.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.log) |
-| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.log) |
-| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp16o1.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.log) |
+| resnet50 | linear probe | ImageNet2012 | [config](./configs/swav_resnet50_224_lp_in1k_1n8c_dp_fp32.yaml) | A100*N1C8 | 100 | 75.3 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_linearprobe.log) |
+| resnet50 | finetune-semi10 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml) | A100*N1C4 | 20 | 69.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_10percent.log) |
+| resnet50 | finetune-semi1 | ImageNet2012 | [config](./configs/swav_resnet50_224_ft_in1k_1n4c_dp_fp32.yaml) | A100*N1C4 | 20 | 55.0 | [model](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.pdparams) \| [log](https://passl.bj.bcebos.com/models/swav/swav_resnet50_in1k_finetune_1percent.log) |
 
 ## Citations
 ```bibtex