This repository was archived by the owner on Mar 19, 2024. It is now read-only.

Commit 7337369

QuentinDuval authored and facebook-github-bot committed
Add new RegNet for SwAV (#214)
Summary: New RegNet for SwAV, plus a unit test checking that the associated pre-training works on 1 node of 8 GPUs.
Pull Request resolved: fairinternal/ssl_scaling#214
Reviewed By: prigoyal
Differential Revision: D33801136
Pulled By: QuentinDuval
fbshipit-source-id: 3b8bf89039d91ab7cb9686bf8e60d640ace95907
1 parent 7b18cd7 commit 7337369
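Note that the new model configuration is not a standalone recipe: it is meant to be composed on top of the existing 8-node SwAV config through a Hydra override (+config/pretrain/seer/models=regnet10B), which is exactly how the unit test below assembles its training configuration.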

File tree

2 files changed: +146 −0 lines changed

2 files changed

+146
-0
lines changed
Lines changed: 94 additions & 0 deletions

@@ -0,0 +1,94 @@
# @package _global_
config:
  TRAINER:
    TASK_NAME: self_supervision_fsdp_task
  DATA:
    TRAIN:
      BATCHSIZE_PER_REPLICA: 16
      TRANSFORMS:
        - name: ImgPilToMultiCrop
          total_num_crops: 6
          size_crops: [160, 96]
          num_crops: [2, 4]
          crop_scales: [[0.14, 1], [0.05, 0.14]]
        - name: RandomHorizontalFlip
          p: 0.5
        - name: ImgPilColorDistortion
          strength: 1.0
        - name: ImgPilGaussianBlur
          p: 0.5
          radius_min: 0.1
          radius_max: 2.0
        - name: ToTensor
        - name: Normalize
          mean: [0.485, 0.456, 0.406]
          std: [0.229, 0.224, 0.225]
      COLLATE_FUNCTION_PARAMS:
        create_multidimensional_tensor: True
  MODEL:
    TRUNK:
      NAME: regnet_fsdp
      REGNET:
        block_type: res_bottleneck_block
        depth: 27
        group_width: 1010
        w_0: 1744
        w_a: 620.83
        w_m: 2.52
        stage_checkpoints: [[2], [7], [9, 17], []]
    HEAD:
      PARAMS: [
        ["swav_head_fsdp", {
          "dims": [28280, 8192, 8192, 256],
          "use_bn": False,
          "num_clusters": [16000]
        }],
      ]
    FSDP_CONFIG:
      AUTO_WRAP_THRESHOLD: 100000000
      flatten_parameters: False
      mixed_precision: True
      fp32_reduce_scatter: False
      compute_dtype: float16
    CUDA_CACHE:
      CLEAR_CUDA_CACHE: True
      CLEAR_FREQ: 5000
    SYNC_BN_CONFIG:
      CONVERT_BN_TO_SYNC_BN: True
      SYNC_BN_TYPE: "pytorch"
    AMP_PARAMS:
      USE_AMP: True
      AMP_TYPE: "pytorch"
    ACTIVATION_CHECKPOINTING:
      USE_ACTIVATION_CHECKPOINTING: True
  LOSS:
    swav_loss:
      num_iters: 10
      epsilon: 0.03
      temp_hard_assignment_iters: 0
      num_crops: 6
      num_prototypes: [16000]
  OPTIMIZER:
    name: "sgd_fsdp"
    use_larc: True
    construct_single_param_group_only: True
    weight_decay: 0.00001
    num_epochs: 1
    param_schedulers:
      lr:
        # we make it convenient to scale Learning rate automatically as per the scaling
        # rule specified in https://arxiv.org/abs/1706.02677 (ImageNet in 1Hour).
        auto_lr_scaling:
          auto_scale: True
          base_value: 0.3
        lengths: [0.043648,0.956352]
  CHECKPOINT:
    CHECKPOINT_ITER_FREQUENCY: 100
    LATEST_CHECKPOINT_RESUME_FILE_NUM: 1
    USE_SYMLINK_CHECKPOINT_FOR_RESUME: True
  DISTRIBUTED:
    NCCL_DEBUG: False
    NUM_NODES: 62
    NUM_PROC_PER_NODE: 8
    NCCL_SOCKET_NTHREADS: ''
  LOG_FREQUENCY: 1
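The trunk above is defined purely by the RegNet design-space parameters (depth, w_0, w_a, w_m, group_width). As a rough illustration of how these numbers expand into concrete stages, the following is a minimal sketch of the width computation from Radosavovic et al., "Designing Network Design Spaces"; the helper below is illustrative, not a VISSL API:

    import numpy as np

    def regnet_stages(w_0, w_a, w_m, depth, group_width, q=8):
        # Illustrative sketch of the RegNet parameterization, not VISSL code.
        # Continuous per-block widths: u_j = w_0 + w_a * j
        u = w_0 + w_a * np.arange(depth)
        # Quantize every block width to w_0 * w_m**k with integer k
        k = np.round(np.log(u / w_0) / np.log(w_m))
        widths = np.round(w_0 * np.power(w_m, k) / q) * q
        # Round block widths to a multiple of the group width
        widths = np.round(widths / group_width) * group_width
        # Blocks sharing the same quantized width form one stage
        stage_widths = sorted(set(int(w) for w in widths))
        stage_depths = [int((widths == w).sum()) for w in stage_widths]
        return stage_widths, stage_depths

    print(regnet_stages(w_0=1744, w_a=620.83, w_m=2.52, depth=27, group_width=1010))
    # ([2020, 4040, 11110, 28280], [2, 7, 17, 1])

With these parameters the last stage comes out at width 28280, which matches the 28280-dimensional input of the swav_head_fsdp MLP, and the per-stage depths of [2, 7, 17, 1] are consistent with the four entries of stage_checkpoints.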

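For reference, the auto_lr_scaling block applies the linear scaling rule of the cited paper (Goyal et al., "Accurate, Large Minibatch SGD"): the base learning rate is multiplied by the ratio of the global batch size to a reference batch size. Assuming VISSL's usual reference of 256 images (a default not shown in this file), the full 62-node run processes 62 × 8 × 16 = 7936 images per step, so the effective peak learning rate would be roughly 0.3 × 7936 / 256 ≈ 9.3.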
tests/test_regnet_fsdp_10b.py

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import unittest

from vissl.utils.hydra_config import compose_hydra_configuration, convert_to_attrdict
from vissl.utils.test_utils import (
    gpu_test,
    in_temporary_directory,
    run_integration_test,
)


class TestRegnet10B(unittest.TestCase):
    @staticmethod
    def _create_10B_pretrain_config(num_gpus: int, num_steps: int, batch_size: int):
        data_limit = num_steps * batch_size * num_gpus
        cfg = compose_hydra_configuration(
            [
                "config=pretrain/swav/swav_8node_resnet",
                "+config/pretrain/seer/models=regnet10B",
                "config.OPTIMIZER.num_epochs=1",
                "config.LOG_FREQUENCY=1",
                # Testing on fake images
                "config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
                "config.DATA.TRAIN.RANDOM_SYNTHETIC_IMAGES=True",
                "config.DATA.TRAIN.USE_DEBUGGING_SAMPLER=True",
                # Disable overlap communication and computation for test
                "config.MODEL.FSDP_CONFIG.FORCE_SYNC_CUDA=True",
                # Testing on 8 V100 32GB GPU only
                f"config.DATA.TRAIN.BATCHSIZE_PER_REPLICA={batch_size}",
                f"config.DATA.TRAIN.DATA_LIMIT={data_limit}",
                "config.DISTRIBUTED.NUM_NODES=1",
                f"config.DISTRIBUTED.NUM_PROC_PER_NODE={num_gpus}",
                "config.DISTRIBUTED.RUN_ID=auto",
            ]
        )
        args, config = convert_to_attrdict(cfg)
        return config

    @gpu_test(gpu_count=8)
    def test_regnet_10b_swav_pretraining(self):
        with in_temporary_directory():
            config = self._create_10B_pretrain_config(
                num_gpus=8, num_steps=2, batch_size=4
            )
            results = run_integration_test(config)
            losses = results.get_losses()
            print(losses)
            self.assertEqual(len(losses), 2)
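The final assertion follows directly from how the config is composed: with num_steps=2, batch_size=4 and num_gpus=8, DATA_LIMIT is 2 × 4 × 8 = 64 synthetic images, i.e. exactly two optimization steps at a global batch size of 32, hence two loss values. The test is presumably launched on a single 8-GPU machine, e.g. with python -m pytest tests/test_regnet_fsdp_10b.py (the exact invocation is an assumption, not part of the commit), and the gpu_test(gpu_count=8) decorator should skip it when fewer GPUs are available.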
