diff --git a/.gitignore b/.gitignore index c3d0c6c..d5f1a2f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,8 @@ .vscode/* *.pyc +*.conf wandb *.zip -*.log diff --git a/FedML b/FedML index 400072e..49a3c76 160000 --- a/FedML +++ b/FedML @@ -1 +1 @@ -Subproject commit 400072ef5daa9a9ca0f205ff3a90ccf13f975729 +Subproject commit 49a3c760c7d166d6730c118eb0aafae872c852bf diff --git a/data/cifar100/download_cifar100.sh b/data/cifar100/download_cifar100.sh new file mode 100644 index 0000000..cb2d7f0 --- /dev/null +++ b/data/cifar100/download_cifar100.sh @@ -0,0 +1 @@ +wget https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz \ No newline at end of file diff --git a/data_preprocessing/ImageNet/data_loader.py b/data_preprocessing/ImageNet/data_loader.py index 7a470cb..e528b23 100644 --- a/data_preprocessing/ImageNet/data_loader.py +++ b/data_preprocessing/ImageNet/data_loader.py @@ -4,9 +4,15 @@ import torch import torch.utils.data as data import torchvision.transforms as transforms +from torch.utils.data.distributed import DistributedSampler +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset from .datasets import ImageNet +from .datasets import ImageNet100 from .datasets import ImageNet_truncated +from .datasets_hdf5 import ImageNet_hdf5 +from .datasets_hdf5 import ImageNet_truncated_hdf5 + logging.basicConfig() logger = logging.getLogger() @@ -35,12 +41,18 @@ def __call__(self, img): return img -def _data_transforms_ImageNet(): +def _data_transforms_ImageNet(args): # IMAGENET_MEAN = [0.5071, 0.4865, 0.4409] # IMAGENET_STD = [0.2673, 0.2564, 0.2762] + if args.data_transform == 'FLTransform': + IMAGENET_MEAN = [0.5, 0.5, 0.5] + IMAGENET_STD = [0.5, 0.5, 0.5] + elif args.data_transform == 'NormalTransform': + IMAGENET_MEAN = [0.485, 0.456, 0.406] + IMAGENET_STD = [0.229, 0.224, 0.225] + else: + raise NotImplementedError - IMAGENET_MEAN = [0.485, 0.456, 0.406] - IMAGENET_STD = [0.229, 0.224, 0.225] image_size = 
224 train_transform = transforms.Compose([ @@ -62,74 +74,244 @@ def _data_transforms_ImageNet(): return train_transform, valid_transform -# for centralized training -def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None): - return get_dataloader_ImageNet(datadir, train_bs, test_bs, dataidxs) - - -# for local devices -def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test): - return get_dataloader_test_ImageNet(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test) +def get_ImageNet_truncated(imagenet_dataset_train, imagenet_dataset_test, train_bs, + test_bs, dataidxs=None, net_dataidx_map=None, args=None): + """ + imagenet_dataset_train, imagenet_dataset_test should be ImageNet or ImageNet_hdf5 + """ + if type(imagenet_dataset_train) in [ImageNet, ImageNet100]: + dl_obj = ImageNet_truncated + elif type(imagenet_dataset_train) == ImageNet_hdf5: + dl_obj = ImageNet_truncated_hdf5 + else: + raise NotImplementedError() -def get_dataloader_ImageNet_truncated(imagenet_dataset_train: ImageNet, imagenet_dataset_test: ImageNet, train_bs, - test_bs, dataidxs=None, net_dataidx_map=None): - dl_obj = ImageNet_truncated - - transform_train, transform_test = _data_transforms_ImageNet() + transform_train, transform_test = _data_transforms_ImageNet(args) train_ds = dl_obj(imagenet_dataset_train, dataidxs, net_dataidx_map, train=True, transform=transform_train, download=False) - test_ds = dl_obj(imagenet_dataset_test, dataidxs=None, net_dataidx_map=None, train=False, transform=transform_test, + test_ds = dl_obj(imagenet_dataset_test, dataidxs, net_dataidx_map, train=False, transform=transform_test, download=False) + return train_ds, test_ds - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) - - return train_dl, test_dl - - -def get_dataloader_ImageNet(datadir, train_bs, test_bs, 
dataidxs=None): - dl_obj = ImageNet - - transform_train, transform_test = _data_transforms_ImageNet() - train_ds = dl_obj(datadir, dataidxs=dataidxs, train=True, transform=transform_train, download=False) - test_ds = dl_obj(datadir, dataidxs=None, train=False, transform=transform_test, download=False) +def get_dataloader(dataset_train, dataset_test, train_bs, + test_bs, dataidxs=None, net_dataidx_map=None, args=None): - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) + train_dl = data.DataLoader(dataset=dataset_train, batch_size=train_bs, shuffle=True, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) + test_dl = data.DataLoader(dataset=dataset_test, batch_size=test_bs, shuffle=False, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) return train_dl, test_dl -def get_dataloader_test_ImageNet(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None): - dl_obj = ImageNet - transform_train, transform_test = _data_transforms_ImageNet() +def get_timm_loader(dataset_train, dataset_test, args): + """ + Use for get data loader of timm, for data transforms, augmentations, etc. 
+ dataset: self-defined dataset, + return: timm loader + """ + logging.info("Using timm dataset and dataloader") + + # TODO not sure whether any problem here + data_config = resolve_data_config(vars(args), model=None, verbose=args.rank == 0) + + # setup augmentation batch splits for contrastive loss or split bn + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + # wrap dataset in AugMix helper + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + # create data loaders w/ augmentation pipeiine + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + + # some args not in the args + args.prefetcher = False + args.pin_mem = False + collate_fn = None + args.use_multi_epochs_loader = False + + train_batch_size = args.batch_size + test_batch_size = args.batch_size // 4 + + if args.data_transform == 'FLTransform': + data_config['mean'] = [0.5, 0.5, 0.5] + data_config['std'] = [0.5, 0.5, 0.5] + elif args.data_transform == 'NormalTransform': + pass + # data_config['mean'] = + # data_config['std'] = + else: + raise NotImplementedError + + logging.info("data transform, MEAN: {}, STD: {}.".format( + data_config['mean'], data_config['std'])) + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=train_batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.data_load_num_workers, + 
distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + loader_eval = create_loader( + dataset_test, + input_size=data_config['input_size'], + batch_size=test_batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.data_load_num_workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + return loader_train, loader_eval + + +def distributed_centralized_ImageNet_loader(dataset, data_dir, + world_size, rank, batch_size, args): + """ + Used for generating distributed dataloader for + accelerating centralized training + """ + + train_bs=batch_size + test_bs=batch_size + + transform_train, transform_test = _data_transforms_ImageNet(args) + if dataset == 'ILSVRC2012': + train_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + elif dataset == 'ILSVRC2012-100': + train_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 100 + elif dataset == 'ILSVRC2012_hdf5': + train_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + else: + raise NotImplementedError + + + if args.if_timm_dataset: + train_dl, test_dl = get_timm_loader(train_dataset, test_dataset, args) + else: + train_sam = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank) + # test_sam = 
DistributedSampler(test_dataset, num_replicas=world_size, rank=rank) + + train_dl = data.DataLoader(train_dataset, batch_size=train_bs , sampler=train_sam, + pin_memory=True, num_workers=args.data_load_num_workers) + + test_dl = data.DataLoader(test_dataset, batch_size=test_bs, sampler=None, + pin_memory=True, num_workers=args.data_load_num_workers) - train_ds = dl_obj(datadir, dataidxs=dataidxs_train, train=True, transform=transform_train, download=True) - test_ds = dl_obj(datadir, dataidxs=dataidxs_test, train=False, transform=transform_test, download=True) - - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) - - return train_dl, test_dl - - -def load_partition_data_ImageNet(dataset, data_dir, - partition_method=None, partition_alpha=None, client_number=100, batch_size=10): - train_dataset = ImageNet(data_dir=data_dir, - dataidxs=None, - train=True) + train_data_num = len(train_dataset) + test_data_num = len(test_dataset) - test_dataset = ImageNet(data_dir=data_dir, - dataidxs=None, - train=False) + logging.info("len of train_dataset: {}".format(train_data_num)) + logging.info("len of test_dataset: {}".format(test_data_num)) + + return train_data_num, test_data_num, train_dl, test_dl, \ + None, None, None, class_num + + +def load_partition_data_ImageNet(dataset, data_dir, partition_method=None, partition_alpha=None, + client_number=100, batch_size=10, args=None): + + transform_train, transform_test = _data_transforms_ImageNet(args) + if dataset == 'ILSVRC2012': + train_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + elif dataset == 'ILSVRC2012-100': + train_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=True, + 
transform=transform_train) + + test_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 100 + elif dataset == 'ILSVRC2012_hdf5': + train_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + else: + raise NotImplementedError net_dataidx_map = train_dataset.get_net_dataidx_map() - class_num = 1000 # logging.info("traindata_cls_counts = " + str(traindata_cls_counts)) # train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) @@ -137,11 +319,12 @@ def load_partition_data_ImageNet(dataset, data_dir, test_data_num = len(test_dataset) class_num_dict = train_dataset.get_data_local_num_dict() - # train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size) - - train_data_global, test_data_global = get_dataloader_ImageNet_truncated(train_dataset, test_dataset, - train_bs=batch_size, test_bs=batch_size, - dataidxs=None, net_dataidx_map=None, ) + if args.if_timm_dataset: + train_data_global, test_data_global = get_timm_loader(train_dataset, test_dataset, args) + else: + train_data_global, test_data_global = get_dataloader(train_dataset, test_dataset, + train_bs=batch_size, test_bs=batch_size, + dataidxs=None, net_dataidx_map=None, args=None) logging.info("train_dl_global number = " + str(len(train_data_global))) logging.info("test_dl_global number = " + str(len(test_data_global))) @@ -153,11 +336,19 @@ def load_partition_data_ImageNet(dataset, data_dir, for client_idx in range(client_number): if client_number == 1000: + if dataset not in ['ILSVRC2012', 'ILSVRC2012_hdf5']: + raise NotImplementedError("Only support 1000 clients for Full ILSVRC2012!") dataidxs = client_idx data_local_num_dict = class_num_dict elif client_number == 100: - dataidxs = [client_idx * 10 + i for i in 
range(10)] - data_local_num_dict[client_idx] = sum(class_num_dict[client_idx + i] for i in range(10)) + if dataset in ['ILSVRC2012', 'ILSVRC2012_hdf5']: + dataidxs = [client_idx * 10 + i for i in range(10)] + data_local_num_dict[client_idx] = sum(class_num_dict[client_idx + i] for i in range(10)) + elif dataset in ['ILSVRC2012-100']: + dataidxs = client_idx + data_local_num_dict = class_num_dict + else: + raise NotImplementedError else: raise NotImplementedError("Not support other client_number for now!") @@ -168,10 +359,16 @@ def load_partition_data_ImageNet(dataset, data_dir, # training batch size = 64; algorithms batch size = 32 # train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size, # dataidxs) - train_data_local, test_data_local = get_dataloader_ImageNet_truncated(train_dataset, test_dataset, - train_bs=batch_size, test_bs=batch_size, - dataidxs=dataidxs, - net_dataidx_map=net_dataidx_map) + train_dataset_local, test_dataset_local = get_ImageNet_truncated(train_dataset, test_dataset, + train_bs=batch_size, test_bs=batch_size, + dataidxs=dataidxs, + net_dataidx_map=net_dataidx_map, args=args) + if args.if_timm_dataset: + train_data_local, test_data_local = get_timm_loader(train_dataset_local, test_dataset_local, args) + else: + train_data_local, test_data_local = get_dataloader(train_dataset_local, test_dataset_local, + train_bs=batch_size, test_bs=batch_size, + dataidxs=None, net_dataidx_map=None, args=args) # logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( # client_idx, len(train_data_local), len(test_data_local))) @@ -184,7 +381,8 @@ def load_partition_data_ImageNet(dataset, data_dir, if __name__ == '__main__': - data_dir = '/home/datasets/imagenet/ILSVRC2012_dataset' + # data_dir = '/home/datasets/imagenet/ILSVRC2012_dataset' + data_dir = '/home/datasets/imagenet/imagenet_hdf5/imagenet-shuffled.hdf5' client_number = 100 train_data_num, test_data_num, train_data_global, 
test_data_global, \ diff --git a/data_preprocessing/ImageNet/datasets.py b/data_preprocessing/ImageNet/datasets.py index 44d702b..89ce77d 100644 --- a/data_preprocessing/ImageNet/datasets.py +++ b/data_preprocessing/ImageNet/datasets.py @@ -1,8 +1,10 @@ import os import os.path +import logging -import torch.utils.data as data from PIL import Image +import torch.utils.data as data +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset def has_file_allowed_extension(filename, extensions): @@ -25,18 +27,21 @@ def find_classes(dir): return classes, class_to_idx -def make_dataset(dir, class_to_idx, extensions): +def make_dataset(dir, class_to_idx, extensions, num_classes=1000): images = [] data_local_num_dict = dict() net_dataidx_map = dict() sum_temp = 0 dir = os.path.expanduser(dir) + + i_target = 0 for target in sorted(os.listdir(dir)): + if not (i_target < num_classes): + break d = os.path.join(dir, target) if not os.path.isdir(d): continue - target_num = 0 for root, _, fnames in sorted(os.walk(d)): for fname in sorted(fnames): @@ -49,6 +54,7 @@ def make_dataset(dir, class_to_idx, extensions): net_dataidx_map[class_to_idx[target]] = (sum_temp, sum_temp + target_num) data_local_num_dict[class_to_idx[target]] = target_num sum_temp += target_num + i_target += 1 assert len(images) == sum_temp return images, data_local_num_dict, net_dataidx_map @@ -126,7 +132,83 @@ def __getdatasets__(self): if len(all_data) == 0: raise (RuntimeError("Found 0 files in subfolders of: " + self.data_dir + "\n" "Supported extensions are: " + ",".join( - extensions))) + IMG_EXTENSIONS))) + return all_data, data_local_num_dict, net_dataidx_map + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + # img, target = self.data[index], self.target[index] + + path, target = self.local_data[index] + img = self.loader(path) + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.local_data) + + +class ImageNet100(data.Dataset): + + def __init__(self, data_dir, dataidxs=None, train=True, transform=None, target_transform=None, download=False): + """ + Generating this class too many times will be time-consuming. + So it will be better calling this once and put it into ImageNet_truncated. + """ + self.dataidxs = dataidxs + self.train = train + self.transform = transform + self.target_transform = target_transform + self.download = download + self.loader = default_loader + if self.train: + self.data_dir = os.path.join(data_dir, 'train') + else: + self.data_dir = os.path.join(data_dir, 'val') + + self.all_data, self.data_local_num_dict, self.net_dataidx_map = self.__getdatasets__() + if dataidxs == None: + self.local_data = self.all_data + elif type(dataidxs) == int: + (begin, end) = self.net_dataidx_map[dataidxs] + self.local_data = self.all_data[begin: end] + else: + self.local_data = [] + for idxs in dataidxs: + (begin, end) = self.net_dataidx_map[idxs] + self.local_data += self.all_data[begin: end] + + def get_local_data(self): + return self.local_data + + def get_net_dataidx_map(self): + return self.net_dataidx_map + + def get_data_local_num_dict(self): + return self.data_local_num_dict + + def __getdatasets__(self): + # all_data = datasets.ImageFolder(data_dir, self.transform, self.target_transform) + + classes, class_to_idx = find_classes(self.data_dir) + IMG_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif'] + all_data, data_local_num_dict, net_dataidx_map = make_dataset( + self.data_dir, class_to_idx, IMG_EXTENSIONS, num_classes=100) + if len(all_data) == 0: + raise (RuntimeError("Found 
0 files in subfolders of: " + self.data_dir + "\n" + "Supported extensions are: " + ",".join( + IMG_EXTENSIONS))) return all_data, data_local_num_dict, net_dataidx_map def __getitem__(self, index): diff --git a/data_preprocessing/ImageNet/datasets_hdf5.py b/data_preprocessing/ImageNet/datasets_hdf5.py new file mode 100644 index 0000000..35f6fe8 --- /dev/null +++ b/data_preprocessing/ImageNet/datasets_hdf5.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import os +import os.path + +import torch.utils.data as data +import torchvision.transforms as transforms +import h5py +import numpy as np + +class DatasetHDF5(data.Dataset): + def __init__(self, hdf5fn, t, transform=None, target_transform=None): + """ + t: 'train' or 'val' + """ + super(DatasetHDF5, self).__init__() + self.hf = h5py.File(hdf5fn, 'r', libver='latest', swmr=True) + self.t = t + self.n_images= self.hf['%s_img'%self.t].shape[0] + self.dlabel = self.hf['%s_labels'%self.t][...] + self.d = self.hf['%s_img'%self.t] + # self.transform = transform + # self.target_transform = target_transform + + def _get_dataset_x_and_target(self, index): + img = self.d[index, ...] + target = self.dlabel[index] + return img, np.int64(target) + + def __getitem__(self, index): + img, target = self._get_dataset_x_and_target(index) + # if self.transform is not None: + # img = self.transform(img) + # if self.target_transform is not None: + # target = self.target_transform(target) + return img, target + + def __len__(self): + return self.n_images + + +class ImageNet_hdf5(data.Dataset): + + def __init__(self, data_dir, dataidxs=None, train=True, transform=None, target_transform=None, download=False): + """ + Generating this class too many times will be time-consuming. + So it will be better calling this once and put it into ImageNet_truncated. 
+ """ + self.dataidxs = dataidxs + self.train = train + self.transform = transform + self.target_transform = target_transform + self.download = download + self.hdf5fn = os.path.join(data_dir) + + # if self.train: + # self.data_dir = os.path.join(data_dir, 'train') + # else: + # self.data_dir = os.path.join(data_dir, 'val') + + self.all_data_hdf5 = DatasetHDF5(self.hdf5fn, 'train' if self.train else 'val', + transform=self.transform, target_transform=self.target_transform) + + self.data_local_num_dict, self.net_dataidx_map = \ + self._get_net_dataidx_map() + + """ + self.local_data_idx is a list containing indexes of local client + """ + self.all_data_idx = range(len(self.all_data_hdf5)) + if dataidxs == None: + self.local_data_idx = self.all_data_idx + elif type(dataidxs) == int: + self.local_data_idx = self.net_dataidx_map[dataidxs] + else: + self.local_data_idx = [] + for idxs in dataidxs: + self.local_data_idx += self.net_dataidx_map[idxs] + + + def _get_net_dataidx_map(self): + data_local_num_dict = dict() + net_dataidx_map = dict() + for i, label in enumerate(self.all_data_hdf5.dlabel): + label_int = np.int64(label) + if label in net_dataidx_map: + net_dataidx_map[label_int].append(i) + else: + net_dataidx_map[label_int] = [] + net_dataidx_map[label_int].append(i) + + for key, value in net_dataidx_map.items(): + data_local_num_dict[key] = len(value) + + return data_local_num_dict, net_dataidx_map + + + def get_net_dataidx_map(self): + return self.net_dataidx_map + + def get_data_local_num_dict(self): + return self.data_local_num_dict + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + + img, target = self.all_data_hdf5[self.local_data_idx[index]] + img = transforms.ToPILImage()(img) + # img = self.loader(path) + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.local_data_idx) + + + +class ImageNet_truncated_hdf5(data.Dataset): + + def __init__(self, imagenet_dataset: ImageNet_hdf5, dataidxs, net_dataidx_map, train=True, transform=None, + target_transform=None, download=False): + + self.dataidxs = dataidxs + self.train = train + # self.transform = transform + # self.target_transform = target_transform + self.download = download + + self.all_data_hdf5 = imagenet_dataset + + self.data_local_num_dict = imagenet_dataset.data_local_num_dict + + self.net_dataidx_map = imagenet_dataset.net_dataidx_map + + """ + self.local_data_idx is a list containing indexes of local client + """ + self.all_data_idx = range(len(self.all_data_hdf5)) + if dataidxs == None: + self.local_data_idx = self.all_data_idx + elif type(dataidxs) == int: + self.local_data_idx = self.net_dataidx_map[dataidxs] + else: + self.local_data_idx = [] + for idxs in dataidxs: + self.local_data_idx += self.net_dataidx_map[idxs] + + + def get_net_dataidx_map(self): + return self.net_dataidx_map + + def get_data_local_num_dict(self): + return self.data_local_num_dict + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + + # Transform operation has been conducted in all_data_hdf5 + img, target = self.all_data_hdf5[self.local_data_idx[index]] + return img, target + + def __len__(self): + return len(self.local_data_idx) + + + + + + + + + + diff --git a/data_preprocessing/Landmarks/data_loader.py b/data_preprocessing/Landmarks/data_loader.py index 377efb7..d1e8ea9 100644 --- a/data_preprocessing/Landmarks/data_loader.py +++ b/data_preprocessing/Landmarks/data_loader.py @@ -9,6 +9,7 @@ import torch import torch.utils.data as data import torchvision.transforms as transforms +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset from .datasets import Landmarks @@ -28,47 +29,7 @@ def _read_csv(path: str): with open(path, 'r') as f: return list(csv.DictReader(f)) -# class Cutout(object): -# def __init__(self, length): -# self.length = length -# def __call__(self, img): -# h, w = img.size(1), img.size(2) -# mask = np.ones((h, w), np.float32) -# y = np.random.randint(h) -# x = np.random.randint(w) - -# y1 = np.clip(y - self.length // 2, 0, h) -# y2 = np.clip(y + self.length // 2, 0, h) -# x1 = np.clip(x - self.length // 2, 0, w) -# x2 = np.clip(x + self.length // 2, 0, w) - -# mask[y1: y2, x1: x2] = 0. 
-# mask = torch.from_numpy(mask) -# mask = mask.expand_as(img) -# img *= mask -# return img - -# def _data_transforms_landmarks(): -# landmarks_MEAN = [0.5071, 0.4865, 0.4409] -# landmarks_STD = [0.2673, 0.2564, 0.2762] - -# train_transform = transforms.Compose([ -# transforms.ToPILImage(), -# transforms.RandomCrop(32, padding=4), -# transforms.RandomHorizontalFlip(), -# transforms.ToTensor(), -# transforms.Normalize(landmarks_MEAN, landmarks_STD), -# ]) - -# train_transform.transforms.append(Cutout(16)) - -# valid_transform = transforms.Compose([ -# transforms.ToTensor(), -# transforms.Normalize(landmarks_MEAN, landmarks_STD), -# ]) - -# return train_transform, valid_transform class Cutout(object): def __init__(self, length): @@ -92,12 +53,16 @@ def __call__(self, img): return img -def _data_transforms_landmarks(): - # IMAGENET_MEAN = [0.5071, 0.4865, 0.4409] - # IMAGENET_STD = [0.2673, 0.2564, 0.2762] +def _data_transforms_landmarks(args): - IMAGENET_MEAN = [0.5, 0.5, 0.5] - IMAGENET_STD = [0.5, 0.5, 0.5] + if args.data_transform == 'FLTransform': + IMAGENET_MEAN = [0.5, 0.5, 0.5] + IMAGENET_STD = [0.5, 0.5, 0.5] + elif args.data_transform == 'NormalTransform': + IMAGENET_MEAN = [0.485, 0.456, 0.406] + IMAGENET_STD = [0.229, 0.224, 0.225] + else: + raise NotImplementedError image_size = 224 train_transform = transforms.Compose([ @@ -132,7 +97,7 @@ def get_mapping_per_user(fn): mapping_table = _read_csv(fn) expected_cols = ['user_id', 'image_id', 'class'] if not all(col in mapping_table[0].keys() for col in expected_cols): - logger.error('%s has wrong format.', mapping_file) + logger.error('%s has wrong format.', fn) raise ValueError( 'The mapping file must contain user_id, image_id and class columns. 
' 'The existing columns are %s' % ','.join(mapping_table[0].keys())) @@ -161,46 +126,127 @@ def get_mapping_per_user(fn): return data_files, data_local_num_dict, net_dataidx_map -# for centralized training -def get_dataloader(dataset, datadir, train_files, test_files, train_bs, test_bs, dataidxs=None): - return get_dataloader_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs) - - -# for local devices -def get_dataloader_test(dataset, datadir, train_files, test_files, train_bs, test_bs, dataidxs_train, dataidxs_test): - return get_dataloader_test_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs_train, dataidxs_test) +def get_dataloader(dataset_train, dataset_test, dataidxs=None, args=None): + train_bs = args.batch_size + test_bs = args.batch_size + train_dl = data.DataLoader(dataset=dataset_train, batch_size=train_bs, shuffle=True, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) + test_dl = data.DataLoader(dataset=dataset_test, batch_size=test_bs, shuffle=False, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) -def get_dataloader_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs=None): - dl_obj = Landmarks + return train_dl, test_dl - transform_train, transform_test = _data_transforms_landmarks() - train_ds = dl_obj(datadir, train_files, dataidxs=dataidxs, train=True, transform=transform_train, download=True) - test_ds = dl_obj(datadir, test_files, dataidxs=None, train=False, transform=transform_test, download=True) +# def get_dataloader_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs=None): +# dl_obj = Landmarks - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) +# transform_train, transform_test = _data_transforms_landmarks() - return train_dl, test_dl +# train_ds = 
def get_timm_loader(dataset_train, dataset_test, args):
    """Build timm train/eval DataLoaders for the given datasets.

    Delegates to timm's ``create_loader`` so that all of timm's data
    transforms / augmentations (random erasing, color jitter, auto-augment,
    AugMix batch splits, ...) are applied.

    NOTE(review): this function mutates ``args`` (sets ``prefetcher``,
    ``pin_mem`` and ``use_multi_epochs_loader``) because those switches are
    not part of the CLI arguments; callers observe the mutated values.

    :param dataset_train: training dataset (timm-compatible / self-defined)
    :param dataset_test: test dataset
    :param args: parsed experiment arguments (timm-style fields expected)
    :return: tuple ``(loader_train, loader_eval)``
    :raises ValueError: if ``args.aug_splits == 1`` (a split of 1 makes no sense)
    :raises NotImplementedError: for an unknown ``args.data_transform``
    """
    logging.info("Using timm dataset and dataloader")

    # Resolve input size / interpolation / mean / std from args; only rank 0
    # logs the resolved config.
    # TODO(review): confirm resolve_data_config behaves as intended with model=None.
    data_config = resolve_data_config(vars(args), model=None, verbose=args.rank == 0)

    # Setup augmentation batch splits for contrastive loss or split bn.
    num_aug_splits = 0
    if args.aug_splits > 0:
        if args.aug_splits == 1:
            # Raise instead of assert: asserts are stripped under `python -O`.
            raise ValueError('A split of 1 makes no sense')
        num_aug_splits = args.aug_splits

    # Wrap dataset in the AugMix helper when batch splits are requested.
    if num_aug_splits > 1:
        dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)

    # Create data loaders w/ augmentation pipeline.
    train_interpolation = args.train_interpolation
    if args.no_aug or not train_interpolation:
        train_interpolation = data_config['interpolation']

    # Defaults for switches that are not exposed on the CLI.
    args.prefetcher = False
    args.pin_mem = False
    collate_fn = None
    args.use_multi_epochs_loader = False

    train_batch_size = args.batch_size
    # Evaluation uses a quarter of the training batch size.
    test_batch_size = args.batch_size // 4

    if args.data_transform == 'FLTransform':
        data_config['mean'] = [0.5, 0.5, 0.5]
        data_config['std'] = [0.5, 0.5, 0.5]
    elif args.data_transform == 'NormalTransform':
        # Keep the mean/std resolved by timm.
        pass
    else:
        raise NotImplementedError

    logging.info("data transform, MEAN: {}, STD: {}.".format(
        data_config['mean'], data_config['std']))

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=train_batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        no_aug=args.no_aug,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        re_split=args.resplit,
        scale=args.scale,
        ratio=args.ratio,
        hflip=args.hflip,
        vflip=args.vflip,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_aug_splits=num_aug_splits,
        interpolation=train_interpolation,
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.data_load_num_workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        pin_memory=args.pin_mem,
        use_multi_epochs_loader=args.use_multi_epochs_loader
    )

    loader_eval = create_loader(
        dataset_test,
        input_size=data_config['input_size'],
        batch_size=test_batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.data_load_num_workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
        pin_memory=args.pin_mem,
    )
    return loader_train, loader_eval
train_data_global, test_data_global = get_dataloader(train_dataset, test_dataset, args) + # logging.info("train_dl_global number = " + str(len(train_data_global))) # logging.info("test_dl_global number = " + str(len(test_data_global))) test_data_num = len(test_files) @@ -227,9 +284,13 @@ def load_partition_data_landmarks(dataset, data_dir, fed_train_map_file, fed_tes # data_local_num_dict[client_idx] = local_data_num # logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num)) - # training batch size = 64; algorithms batch size = 32 - train_data_local, test_data_local = get_dataloader(dataset, data_dir, train_files, test_files, batch_size, batch_size, - dataidxs) + train_dataset_local = Landmarks(data_dir, train_files, dataidxs=dataidxs, train=True, transform=transform_train, download=True) + test_dataset_local = Landmarks(data_dir, test_files, dataidxs=None, train=False, transform=transform_test, download=True) + if args.if_timm_dataset: + train_data_local, test_data_local = get_timm_loader(train_dataset_local, test_dataset_local, args) + else: + train_data_local, test_data_local = get_dataloader(train_dataset_local, test_dataset_local, args) + # logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( # client_idx, len(train_data_local), len(test_data_local))) train_data_local_dict[client_idx] = train_data_local diff --git a/data_preprocessing/cifar10/__init__.py b/data_preprocessing/cifar10/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_preprocessing/cifar10/data_loader.py b/data_preprocessing/cifar10/data_loader.py new file mode 100644 index 0000000..3377f9d --- /dev/null +++ b/data_preprocessing/cifar10/data_loader.py @@ -0,0 +1,269 @@ +import logging + +import numpy as np +import torch +import torch.utils.data as data +import torchvision.transforms as transforms + +from .datasets import CIFAR10_truncated + +logging.basicConfig() +logger = logging.getLogger() 
logging.getLogger().setLevel(logging.INFO)


def read_data_distribution(filename='./data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'):
    """Read a pre-computed {client_id: {class_id: sample_count}} dump.

    The file is a pretty-printed nested dict, one entry per line; purely
    structural lines (starting with '{' or '}') are skipped.
    """
    distribution = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0]:
                tmp = line.split(':')
                if '{' == tmp[1].strip():
                    first_level_key = int(tmp[0])
                    distribution[first_level_key] = {}
                else:
                    second_level_key = int(tmp[0])
                    distribution[first_level_key][second_level_key] = int(tmp[1].strip().replace(',', ''))
    return distribution


def read_net_dataidx_map(filename='./data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'):
    """Read a pre-computed {client_id: [sample_idx, ...]} dump."""
    net_dataidx_map = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0] and ']' != line[0]:
                tmp = line.split(':')
                if '[' == tmp[-1].strip():
                    key = int(tmp[0])
                    net_dataidx_map[key] = []
                else:
                    tmp_array = line.split(',')
                    net_dataidx_map[key] = [int(i.strip()) for i in tmp_array]
    return net_dataidx_map


def record_net_data_stats(y_train, net_dataidx_map):
    """Return per-client class histograms: {client: {class: count}}."""
    net_cls_counts = {}
    for net_i, dataidx in net_dataidx_map.items():
        unq, unq_cnt = np.unique(y_train[dataidx], return_counts=True)
        net_cls_counts[net_i] = {unq[i]: unq_cnt[i] for i in range(len(unq))}
    logging.debug('Data statistics: %s', str(net_cls_counts))
    return net_cls_counts


class Cutout(object):
    """Zero out one randomly-placed square patch of a CHW image tensor (in place)."""

    def __init__(self, length):
        # length: side of the square patch, in pixels (clipped at image borders)
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)
        # Patch center is uniform over the image; the patch itself is clipped.
        y = np.random.randint(h)
        x = np.random.randint(w)

        y1 = np.clip(y - self.length // 2, 0, h)
        y2 = np.clip(y + self.length // 2, 0, h)
        x1 = np.clip(x - self.length // 2, 0, w)
        x2 = np.clip(x + self.length // 2, 0, w)

        mask[y1: y2, x1: x2] = 0.
        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)
        img *= mask
        return img


def _data_transforms_cifar10():
    """Return (train_transform, test_transform) for CIFAR-10.

    Train: random crop w/ padding, horizontal flip, normalize, Cutout(16).
    Test: normalize only.
    """
    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    train_transform.transforms.append(Cutout(16))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    return train_transform, valid_transform


def load_cifar10_data(datadir):
    """Load full CIFAR-10 as arrays: (X_train, y_train, X_test, y_test)."""
    train_transform, test_transform = _data_transforms_cifar10()

    cifar10_train_ds = CIFAR10_truncated(datadir, train=True, download=True, transform=train_transform)
    cifar10_test_ds = CIFAR10_truncated(datadir, train=False, download=True, transform=test_transform)

    X_train, y_train = cifar10_train_ds.data, cifar10_train_ds.target
    X_test, y_test = cifar10_test_ds.data, cifar10_test_ds.target

    return (X_train, y_train, X_test, y_test)


def partition_data(dataset, datadir, partition, n_nets, alpha):
    """Partition CIFAR-10 across ``n_nets`` clients.

    Supported ``partition`` values:
      - "homo": random equal-size IID split
      - "hetero": Dirichlet(alpha) label-skew split; resampled until every
        client holds at least 10 samples
      - "hetero-fix": load a pre-computed split from disk

    :return: (X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts)
    :raises ValueError: on an unknown partition method (previously this fell
        through to a confusing NameError on ``net_dataidx_map``)
    """
    logging.info("*********partition data***************")
    X_train, y_train, X_test, y_test = load_cifar10_data(datadir)
    n_train = X_train.shape[0]

    if partition == "homo":
        idxs = np.random.permutation(n_train)
        batch_idxs = np.array_split(idxs, n_nets)
        net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)}

    elif partition == "hetero":
        min_size = 0
        K = 10  # number of CIFAR-10 classes
        N = y_train.shape[0]
        logging.info("N = " + str(N))
        net_dataidx_map = {}

        # Resample until every client holds at least 10 samples.
        while min_size < 10:
            idx_batch = [[] for _ in range(n_nets)]
            for k in range(K):
                idx_k = np.where(y_train == k)[0]
                np.random.shuffle(idx_k)
                proportions = np.random.dirichlet(np.repeat(alpha, n_nets))
                # Balance: zero out clients already holding >= N / n_nets samples.
                proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)])
                proportions = proportions / proportions.sum()
                proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1]
                idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))]
            min_size = min([len(idx_j) for idx_j in idx_batch])

        for j in range(n_nets):
            np.random.shuffle(idx_batch[j])
            net_dataidx_map[j] = idx_batch[j]

    elif partition == "hetero-fix":
        dataidx_map_file_path = './data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'
        net_dataidx_map = read_net_dataidx_map(dataidx_map_file_path)

    else:
        raise ValueError("unknown partition method: %s" % partition)

    if partition == "hetero-fix":
        distribution_file_path = './data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'
        traindata_cls_counts = read_data_distribution(distribution_file_path)
    else:
        traindata_cls_counts = record_net_data_stats(y_train, net_dataidx_map)

    return X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts


# for centralized training
def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_CIFAR10(datadir, train_bs, test_bs, dataidxs)


# for local devices
def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_test_CIFAR10(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test)


def get_dataloader_CIFAR10(datadir, train_bs, test_bs, dataidxs=None):
    """Train/test DataLoaders; the train set is truncated to ``dataidxs`` if given."""
    dl_obj = CIFAR10_truncated

    transform_train, transform_test = _data_transforms_cifar10()

    train_ds = dl_obj(datadir, dataidxs=dataidxs, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, train=False, transform=transform_test, download=True)

    # drop_last=True keeps batch shapes constant (e.g. for BatchNorm).
    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def get_dataloader_test_CIFAR10(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None):
    """Train/test DataLoaders with independent truncation index lists."""
    dl_obj = CIFAR10_truncated

    transform_train, transform_test = _data_transforms_cifar10()

    train_ds = dl_obj(datadir, dataidxs=dataidxs_train, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, dataidxs=dataidxs_test, train=False, transform=transform_test, download=True)

    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def load_partition_data_distributed_cifar10(process_id, dataset, data_dir, partition_method, partition_alpha,
                                            client_number, batch_size):
    """Partitioned CIFAR-10 for the distributed setting.

    Process 0 (the server) gets the global loaders; every other process gets
    only its own client shard (client index = process_id - 1).
    """
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    if process_id == 0:
        # Server: global loaders only.
        train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
        logging.info("train_dl_global number = " + str(len(train_data_global)))
        logging.info("test_dl_global number = " + str(len(test_data_global)))
        train_data_local = None
        test_data_local = None
        local_data_num = 0
    else:
        # Client: local shard only.
        dataidxs = net_dataidx_map[process_id - 1]
        local_data_num = len(dataidxs)
        logging.info("rank = %d, local_sample_number = %d" % (process_id, local_data_num))
        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("process_id = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            process_id, len(train_data_local), len(test_data_local)))
        train_data_global = None
        test_data_global = None
    return train_data_num, train_data_global, test_data_global, local_data_num, train_data_local, test_data_local, class_num


def load_partition_data_cifar10(dataset, data_dir, partition_method, partition_alpha, client_number, batch_size):
    """Partitioned CIFAR-10 for the single-process (standalone) setting.

    Builds the global loaders plus one (train, test) loader pair per client.
    Note the local *test* loader is the full test set for every client.
    """
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
    logging.info("train_dl_global number = " + str(len(train_data_global)))
    logging.info("test_dl_global number = " + str(len(test_data_global)))
    test_data_num = len(test_data_global)

    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(client_number):
        dataidxs = net_dataidx_map[client_idx]
        local_data_num = len(dataidxs)
        data_local_num_dict[client_idx] = local_data_num
        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))

        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local
    return train_data_num, test_data_num, train_data_global, test_data_global, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num
# ===========================================================================
# data_preprocessing/cifar10/datasets.py
# ===========================================================================
import logging

import numpy as np
import torch.utils.data as data
from PIL import Image
from torchvision.datasets import CIFAR10

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')


def accimage_loader(path):
    """Load an image via accimage, falling back to PIL on decode errors."""
    import accimage
    try:
        return accimage.Image(path)
    except IOError:
        # Potentially a decoding problem, fall back to PIL.Image
        return pil_loader(path)


def pil_loader(path):
    """Load an RGB PIL image.

    Opens via a file object to avoid a ResourceWarning
    (https://github.com/python-pillow/Pillow/issues/835).
    """
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


def default_loader(path):
    """Dispatch to accimage or PIL according to the active torchvision backend."""
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader(path)
    else:
        return pil_loader(path)


class CIFAR10_truncated(data.Dataset):
    """CIFAR-10 restricted to the sample indices in ``dataidxs`` (all samples if None)."""

    def __init__(self, root, dataidxs=None, train=True, transform=None, target_transform=None, download=False):
        self.root = root
        self.dataidxs = dataidxs
        self.train = train
        self.transform = transform
        self.target_transform = target_transform
        self.download = download

        self.data, self.target = self.__build_truncated_dataset__()

    def __build_truncated_dataset__(self):
        """Materialize (data, target) arrays, truncated to ``self.dataidxs``."""
        # logger.debug instead of the previous bare print(): avoid stdout noise.
        logger.debug("download = " + str(self.download))
        cifar_dataobj = CIFAR10(self.root, self.train, self.transform, self.target_transform, self.download)

        # torchvision exposes identical .data/.targets for both splits, so no
        # train/test branching is needed here.
        data = cifar_dataobj.data
        target = np.array(cifar_dataobj.targets)

        if self.dataidxs is not None:
            data = data[self.dataidxs]
            target = target[self.dataidxs]

        return data, target

    def truncate_channel(self, index):
        """Zero the G and B channels of the samples selected by ``index`` (in place)."""
        for i in range(index.shape[0]):
            gs_index = index[i]
            self.data[gs_index, :, :, 1] = 0.0
            self.data[gs_index, :, :, 2] = 0.0

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        img, target = self.data[index], self.target[index]

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return len(self.data)


# ===========================================================================
# data_preprocessing/cifar10/iid_data_loader.py
# ===========================================================================
import os
import argparse
import time
import math
import logging

import torch
import torch.utils.data as data
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data.distributed import DistributedSampler


def load_iid_cifar10(dataset, data_dir, partition_method,
                     partition_alpha, client_number, batch_size, rank=0):
    """IID CIFAR-10 loaders where sharding is done by a DistributedSampler.

    All clients share the same DataLoader objects; when ``client_number > 1``
    the sampler restricts each process to its own shard.  ``dataset``,
    ``partition_method`` and ``partition_alpha`` are accepted only for
    interface parity with the non-IID loaders and are unused.

    :return: (train_data_num, test_data_num, train_dl, test_dl,
              data_local_num_dict, train_data_local_dict,
              test_data_local_dict, class_num)
    """
    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    image_size = 32
    train_transform = transforms.Compose([
        transforms.RandomCrop(image_size, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=CIFAR_MEAN, std=CIFAR_STD),
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=CIFAR_MEAN, std=CIFAR_STD),
    ])

    train_dataset = CIFAR10(root=data_dir, train=True,
                            transform=train_transform, download=False)
    test_dataset = CIFAR10(root=data_dir, train=False,
                           transform=test_transform, download=False)

    train_sampler = None
    shuffle = True
    if client_number > 1:
        # The sampler shards and shuffles, so the DataLoader itself must not.
        train_sampler = DistributedSampler(
            train_dataset, num_replicas=client_number, rank=rank)
        train_sampler.set_epoch(0)
        shuffle = False

    train_dl = data.DataLoader(train_dataset, batch_size=batch_size,
                               shuffle=shuffle, num_workers=4, sampler=train_sampler)
    test_dl = data.DataLoader(test_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)

    class_num = 10

    train_data_num = len(train_dataset)
    test_data_num = len(test_dataset)

    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(client_number):
        # Every client reuses the shared loaders; the sampler does the sharding.
        local_data_num = train_data_num // client_number
        train_data_local_dict[client_idx] = train_dl
        test_data_local_dict[client_idx] = test_dl
        data_local_num_dict[client_idx] = local_data_num
        # Fix: previously logged the *global* sample count as the local one.
        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))

    return train_data_num, test_data_num, train_dl, test_dl, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num


# ===========================================================================
# data_preprocessing/cifar100/data_loader.py
# ===========================================================================
import logging

import numpy as np
import torch
import torch.utils.data as data
import torchvision.transforms as transforms

from .datasets import CIFAR100_truncated

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def read_data_distribution(filename='./data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'):
    """Read a pre-computed {client_id: {class_id: sample_count}} dump."""
    distribution = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0]:
                tmp = line.split(':')
                if '{' == tmp[1].strip():
                    first_level_key = int(tmp[0])
                    distribution[first_level_key] = {}
                else:
                    second_level_key = int(tmp[0])
                    distribution[first_level_key][second_level_key] = int(tmp[1].strip().replace(',', ''))
    return distribution


def read_net_dataidx_map(filename='./data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'):
    """Read a pre-computed {client_id: [sample_idx, ...]} dump."""
    net_dataidx_map = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0] and ']' != line[0]:
                tmp = line.split(':')
                if '[' == tmp[-1].strip():
                    key = int(tmp[0])
                    net_dataidx_map[key] = []
                else:
                    tmp_array = line.split(',')
                    net_dataidx_map[key] = [int(i.strip()) for i in tmp_array]
    return net_dataidx_map


def record_net_data_stats(y_train, net_dataidx_map):
    """Return per-client class histograms: {client: {class: count}}."""
    net_cls_counts = {}
    for net_i, dataidx in net_dataidx_map.items():
        unq, unq_cnt = np.unique(y_train[dataidx], return_counts=True)
        net_cls_counts[net_i] = {unq[i]: unq_cnt[i] for i in range(len(unq))}
    logging.debug('Data statistics: %s', str(net_cls_counts))
    return net_cls_counts


class Cutout(object):
    """Zero out one randomly-placed square patch of a CHW image tensor (in place)."""

    def __init__(self, length):
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)
        y = np.random.randint(h)
        x = np.random.randint(w)

        y1 = np.clip(y - self.length // 2, 0, h)
        y2 = np.clip(y + self.length // 2, 0, h)
        x1 = np.clip(x - self.length // 2, 0, w)
        x2 = np.clip(x + self.length // 2, 0, w)

        mask[y1: y2, x1: x2] = 0.
        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)
        img *= mask
        return img


def _data_transforms_cifar100():
    """Return (train_transform, test_transform) for CIFAR-100."""
    CIFAR_MEAN = [0.5071, 0.4865, 0.4409]
    CIFAR_STD = [0.2673, 0.2564, 0.2762]

    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    train_transform.transforms.append(Cutout(16))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    return train_transform, valid_transform


def load_cifar100_data(datadir):
    """Load full CIFAR-100 as arrays: (X_train, y_train, X_test, y_test)."""
    train_transform, test_transform = _data_transforms_cifar100()

    cifar100_train_ds = CIFAR100_truncated(datadir, train=True, download=True, transform=train_transform)
    cifar100_test_ds = CIFAR100_truncated(datadir, train=False, download=True, transform=test_transform)

    X_train, y_train = cifar100_train_ds.data, cifar100_train_ds.target
    X_test, y_test = cifar100_test_ds.data, cifar100_test_ds.target

    return (X_train, y_train, X_test, y_test)


def partition_data(dataset, datadir, partition, n_nets, alpha):
    """Partition CIFAR-100 across ``n_nets`` clients ("homo" | "hetero" | "hetero-fix").

    :raises ValueError: on an unknown partition method (previously this fell
        through to a confusing NameError on ``net_dataidx_map``)
    """
    logging.info("*********partition data***************")
    X_train, y_train, X_test, y_test = load_cifar100_data(datadir)
    n_train = X_train.shape[0]

    if partition == "homo":
        idxs = np.random.permutation(n_train)
        batch_idxs = np.array_split(idxs, n_nets)
        net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)}

    elif partition == "hetero":
        min_size = 0
        K = 100  # number of CIFAR-100 classes
        N = y_train.shape[0]
        logging.info("N = " + str(N))
        net_dataidx_map = {}

        # Resample until every client holds at least 10 samples.
        while min_size < 10:
            idx_batch = [[] for _ in range(n_nets)]
            for k in range(K):
                idx_k = np.where(y_train == k)[0]
                np.random.shuffle(idx_k)
                proportions = np.random.dirichlet(np.repeat(alpha, n_nets))
                # Balance: zero out clients already holding >= N / n_nets samples.
                proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)])
                proportions = proportions / proportions.sum()
                proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1]
                idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))]
            min_size = min([len(idx_j) for idx_j in idx_batch])

        for j in range(n_nets):
            np.random.shuffle(idx_batch[j])
            net_dataidx_map[j] = idx_batch[j]

    elif partition == "hetero-fix":
        dataidx_map_file_path = './data_preprocessing/non-iid-distribution/CIFAR100/net_dataidx_map.txt'
        net_dataidx_map = read_net_dataidx_map(dataidx_map_file_path)

    else:
        raise ValueError("unknown partition method: %s" % partition)

    if partition == "hetero-fix":
        distribution_file_path = './data_preprocessing/non-iid-distribution/CIFAR100/distribution.txt'
        traindata_cls_counts = read_data_distribution(distribution_file_path)
    else:
        traindata_cls_counts = record_net_data_stats(y_train, net_dataidx_map)

    return X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts


# for centralized training
def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_CIFAR100(datadir, train_bs, test_bs, dataidxs)


# for local devices
def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_test_CIFAR100(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test)


def get_dataloader_CIFAR100(datadir, train_bs, test_bs, dataidxs=None):
    """Train/test DataLoaders; the train set is truncated to ``dataidxs`` if given."""
    dl_obj = CIFAR100_truncated

    transform_train, transform_test = _data_transforms_cifar100()

    train_ds = dl_obj(datadir, dataidxs=dataidxs, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, train=False, transform=transform_test, download=True)

    # drop_last=True keeps batch shapes constant (e.g. for BatchNorm).
    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def get_dataloader_test_CIFAR100(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None):
    """Train/test DataLoaders with independent truncation index lists."""
    dl_obj = CIFAR100_truncated

    transform_train, transform_test = _data_transforms_cifar100()

    train_ds = dl_obj(datadir, dataidxs=dataidxs_train, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, dataidxs=dataidxs_test, train=False, transform=transform_test, download=True)

    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def load_partition_data_distributed_cifar100(process_id, dataset, data_dir, partition_method, partition_alpha,
                                             client_number, batch_size):
    """Partitioned CIFAR-100 for the distributed setting (server = process 0)."""
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    if process_id == 0:
        # Server: global loaders only.
        train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
        logging.info("train_dl_global number = " + str(len(train_data_global)))
        # Fix: previously logged the train loader length here.
        logging.info("test_dl_global number = " + str(len(test_data_global)))
        train_data_local = None
        test_data_local = None
        local_data_num = 0
    else:
        # Client: local shard only.
        dataidxs = net_dataidx_map[process_id - 1]
        local_data_num = len(dataidxs)
        logging.info("rank = %d, local_sample_number = %d" % (process_id, local_data_num))
        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("process_id = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            process_id, len(train_data_local), len(test_data_local)))
        train_data_global = None
        test_data_global = None

    return train_data_num, train_data_global, test_data_global, local_data_num, train_data_local, test_data_local, class_num


def load_partition_data_cifar100(dataset, data_dir, partition_method, partition_alpha, client_number, batch_size):
    """Partitioned CIFAR-100 for the single-process (standalone) setting."""
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
    logging.info("train_dl_global number = " + str(len(train_data_global)))
    # Fix: previously logged the train loader length here.
    logging.info("test_dl_global number = " + str(len(test_data_global)))
    test_data_num = len(test_data_global)

    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(client_number):
        dataidxs = net_dataidx_map[client_idx]
        local_data_num = len(dataidxs)
        data_local_num_dict[client_idx] = local_data_num
        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))

        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local
    return train_data_num, test_data_num, train_data_global, test_data_global, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num
b/data_preprocessing/cifar100/datasets.py new file mode 100644 index 0000000..f94460c --- /dev/null +++ b/data_preprocessing/cifar100/datasets.py @@ -0,0 +1,96 @@ +import logging + +import numpy as np +import torch.utils.data as data +from PIL import Image +from torchvision.datasets import CIFAR100 + +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp') + + +def accimage_loader(path): + import accimage + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def pil_loader(path): + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + +def default_loader(path): + from torchvision import get_image_backend + if get_image_backend() == 'accimage': + return accimage_loader(path) + else: + return pil_loader(path) + + +class CIFAR100_truncated(data.Dataset): + + def __init__(self, root, dataidxs=None, train=True, transform=None, target_transform=None, download=False): + + self.root = root + self.dataidxs = dataidxs + self.train = train + self.transform = transform + self.target_transform = target_transform + self.download = download + + self.data, self.target = self.__build_truncated_dataset__() + + def __build_truncated_dataset__(self): + + cifar_dataobj = CIFAR100(self.root, self.train, self.transform, self.target_transform, self.download) + + if self.train: + # print("train member of the class: {}".format(self.train)) + # data = cifar_dataobj.train_data + data = cifar_dataobj.data + target = np.array(cifar_dataobj.targets) + else: + data = cifar_dataobj.data + target = np.array(cifar_dataobj.targets) + + if self.dataidxs is not None: + data = data[self.dataidxs] + target = target[self.dataidxs] + + return data, target + + def 
truncate_channel(self, index): + for i in range(index.shape[0]): + gs_index = index[i] + self.data[gs_index, :, :, 1] = 0.0 + self.data[gs_index, :, :, 2] = 0.0 + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. + """ + img, target = self.data[index], self.target[index] + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.data) \ No newline at end of file diff --git a/data_preprocessing/cinic10/__init__.py b/data_preprocessing/cinic10/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_preprocessing/cinic10/data_loader.py b/data_preprocessing/cinic10/data_loader.py new file mode 100644 index 0000000..d499d2d --- /dev/null +++ b/data_preprocessing/cinic10/data_loader.py @@ -0,0 +1,321 @@ +import logging +import os + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.data as data +import torchvision.transforms as transforms + +from .datasets import ImageFolderTruncated + +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) + + +# generate the non-IID distribution for all methods +def read_data_distribution(filename='./data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'): + distribution = {} + with open(filename, 'r') as data: + for x in data.readlines(): + if '{' != x[0] and '}' != x[0]: + tmp = x.split(':') + if '{' == tmp[1].strip(): + first_level_key = int(tmp[0]) + distribution[first_level_key] = {} + else: + second_level_key = int(tmp[0]) + distribution[first_level_key][second_level_key] = int(tmp[1].strip().replace(',', '')) + return distribution + + +def read_net_dataidx_map(filename='./data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'): + net_dataidx_map = {} + with open(filename, 'r') as 
data: + for x in data.readlines(): + if '{' != x[0] and '}' != x[0] and ']' != x[0]: + tmp = x.split(':') + if '[' == tmp[-1].strip(): + key = int(tmp[0]) + net_dataidx_map[key] = [] + else: + tmp_array = x.split(',') + net_dataidx_map[key] = [int(i.strip()) for i in tmp_array] + return net_dataidx_map + + +def record_net_data_stats(y_train, net_dataidx_map): + net_cls_counts = {} + + for net_i, dataidx in net_dataidx_map.items(): + unq, unq_cnt = np.unique(y_train[dataidx], return_counts=True) + tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} + net_cls_counts[net_i] = tmp + logging.debug('Data statistics: %s' % str(net_cls_counts)) + return net_cls_counts + + +class Cutout(object): + def __init__(self, length): + self.length = length + + def __call__(self, img): + h, w = img.size(1), img.size(2) + mask = np.ones((h, w), np.float32) + y = np.random.randint(h) + x = np.random.randint(w) + + y1 = np.clip(y - self.length // 2, 0, h) + y2 = np.clip(y + self.length // 2, 0, h) + x1 = np.clip(x - self.length // 2, 0, w) + x2 = np.clip(x + self.length // 2, 0, w) + + mask[y1: y2, x1: x2] = 0. 
+ mask = torch.from_numpy(mask) + mask = mask.expand_as(img) + img *= mask + return img + + +def _data_transforms_cinic10(): + cinic_mean = [0.47889522, 0.47227842, 0.43047404] + cinic_std = [0.24205776, 0.23828046, 0.25874835] + # Transformer for train set: random crops and horizontal flip + train_transform = transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ]) + + # Transformer for test set + valid_transform = transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ]) + return train_transform, valid_transform + + +def load_cinic10_data(datadir): + _train_dir = datadir + str('/train') + logging.info("_train_dir = " + str(_train_dir)) + _test_dir = datadir + str('/test') + cinic_mean = [0.47889522, 0.47227842, 0.43047404] + cinic_std = [0.24205776, 0.23828046, 0.25874835] + trainset = ImageFolderTruncated(_train_dir, transform=transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ])) + + testset = ImageFolderTruncated(_test_dir, transform=transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + 
transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ])) + X_train, y_train = trainset.imgs, trainset.targets + X_test, y_test = testset.imgs, testset.targets + return (X_train, y_train, X_test, y_test) + + +def partition_data(dataset, datadir, partition, n_nets, alpha): + logging.info("*********partition data***************") + pil_logger = logging.getLogger('PIL') + pil_logger.setLevel(logging.INFO) + + X_train, y_train, X_test, y_test = load_cinic10_data(datadir) + X_train = np.array(X_train) + X_test = np.array(X_test) + y_train = np.array(y_train) + y_test = np.array(y_test) + n_train = len(X_train) + # n_test = len(X_test) + + if partition == "homo": + total_num = n_train + idxs = np.random.permutation(total_num) + batch_idxs = np.array_split(idxs, n_nets) + net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)} + + elif partition == "hetero": + min_size = 0 + K = 10 + N = y_train.shape[0] + logging.info("N = " + str(N)) + net_dataidx_map = {} + + while min_size < 10: + idx_batch = [[] for _ in range(n_nets)] + # for each class in the dataset + for k in range(K): + idx_k = np.where(y_train == k)[0] + np.random.shuffle(idx_k) + proportions = np.random.dirichlet(np.repeat(alpha, n_nets)) + ## Balance + proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)]) + proportions = proportions / proportions.sum() + proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1] + idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))] + min_size = min([len(idx_j) for idx_j in idx_batch]) + + for j in range(n_nets): + np.random.shuffle(idx_batch[j]) + net_dataidx_map[j] = idx_batch[j] + + elif partition == "hetero-fix": + dataidx_map_file_path = './data_preprocessing/non-iid-distribution/CINIC10/net_dataidx_map.txt' + net_dataidx_map = read_net_dataidx_map(dataidx_map_file_path) + + if partition == 
"hetero-fix": + distribution_file_path = './data_preprocessing/non-iid-distribution/CINIC10/distribution.txt' + traindata_cls_counts = read_data_distribution(distribution_file_path) + else: + traindata_cls_counts = record_net_data_stats(y_train, net_dataidx_map) + + return X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts + + +# for centralized training +def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None): + return get_dataloader_cinic10(datadir, train_bs, test_bs, dataidxs) + + +# for local devices +def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test): + return get_dataloader_test_cinic10(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test) + + +def get_dataloader_cinic10(datadir, train_bs, test_bs, dataidxs=None): + dl_obj = ImageFolderTruncated + + transform_train, transform_test = _data_transforms_cinic10() + + traindir = os.path.join(datadir, 'train') + valdir = os.path.join(datadir, 'test') + + train_ds = dl_obj(traindir, dataidxs=dataidxs, transform=transform_train) + test_ds = dl_obj(valdir, transform=transform_train) + + train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True) + test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True) + + return train_dl, test_dl + +def get_dataloader_test_cinic10(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None): + dl_obj = ImageFolderTruncated + + transform_train, transform_test = _data_transforms_cinic10() + + traindir = os.path.join(datadir, 'train') + valdir = os.path.join(datadir, 'test') + + train_ds = dl_obj(traindir, dataidxs=dataidxs_train, transform=transform_train) + test_ds = dl_obj(valdir, dataidxs=dataidxs_test, transform=transform_test) + + train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True) + test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True) 
+ + return train_dl, test_dl + + +def load_partition_data_distributed_cinic10(process_id, dataset, data_dir, partition_method, partition_alpha, + client_number, batch_size): + X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset, + data_dir, + partition_method, + client_number, + partition_alpha) + class_num = len(np.unique(y_train)) + logging.info("traindata_cls_counts = " + str(traindata_cls_counts)) + train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) + + # get global test data + if process_id == 0: + train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size) + logging.info("train_dl_global number = " + str(len(train_data_global))) + logging.info("test_dl_global number = " + str(len(train_data_global))) + test_data_num = len(test_data_global) + train_data_local = None + test_data_local = None + local_data_num = 0 + else: + # get local dataset + dataidxs = net_dataidx_map[process_id - 1] + local_data_num = len(dataidxs) + logging.info("rank = %d, local_sample_number = %d" % (process_id, local_data_num)) + # training batch size = 64; algorithms batch size = 32 + train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size, + dataidxs) + logging.info("process_id = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( + process_id, len(train_data_local), len(test_data_local))) + test_data_num = 0 + train_data_global = None + test_data_global = None + + return train_data_num, test_data_num, train_data_global, test_data_global, local_data_num, train_data_local, test_data_local, class_num + + +def load_partition_data_cinic10(dataset, data_dir, partition_method, partition_alpha, client_number, batch_size): + X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset, + data_dir, + partition_method, + client_number, + partition_alpha) + class_num = len(np.unique(y_train)) + 
logging.info("traindata_cls_counts = " + str(traindata_cls_counts)) + train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) + + train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size) + logging.info("train_dl_global number = " + str(len(train_data_global))) + logging.info("test_dl_global number = " + str(len(train_data_global))) + test_data_num = len(test_data_global) + + # get local dataset + data_local_num_dict = dict() + train_data_local_dict = dict() + test_data_local_dict = dict() + + for client_idx in range(client_number): + dataidxs = net_dataidx_map[client_idx] + local_data_num = len(dataidxs) + data_local_num_dict[client_idx] = local_data_num + logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num)) + + # training batch size = 64; algorithms batch size = 32 + train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size, + dataidxs) + logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( + client_idx, len(train_data_local), len(test_data_local))) + train_data_local_dict[client_idx] = train_data_local + test_data_local_dict[client_idx] = test_data_local + return train_data_num, test_data_num, train_data_global, test_data_global, \ + data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num diff --git a/data_preprocessing/cinic10/datasets.py b/data_preprocessing/cinic10/datasets.py new file mode 100644 index 0000000..454c651 --- /dev/null +++ b/data_preprocessing/cinic10/datasets.py @@ -0,0 +1,105 @@ +import logging + +import numpy as np +from PIL import Image +from torchvision.datasets import DatasetFolder + +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp') + + +def accimage_loader(path): + import accimage + try: + return accimage.Image(path) + except 
def pil_loader(path):
    """Load an image with PIL and convert it to RGB.

    Opened via a file handle to avoid a ResourceWarning
    (https://github.com/python-pillow/Pillow/issues/835).
    """
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


def default_loader(path):
    """Dispatch to accimage or PIL depending on the torchvision backend."""
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader(path)
    else:
        return pil_loader(path)


class ImageFolderTruncated(DatasetFolder):
    """A generic data loader where the images are arranged in this way: ::

        root/dog/xxx.png
        root/dog/xxy.png
        root/dog/xxz.png

        root/cat/123.png
        root/cat/nsdf3.png
        root/cat/asd932_.png

    Args:
        root (string): Root directory path.
        dataidxs (sequence of int, optional): If given, the dataset is
            truncated to exactly these sample indices.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        loader (callable, optional): A function to load an image given its path.
        is_valid_file (callable, optional): A function that takes path of an Image file
            and check if the file is a valid_file (used to check of corrupt files)

    Attributes:
        classes (list): List of the class names.
        class_to_idx (dict): Dict with items (class_name, class_index).
        imgs (list): List of (image path, class_index) tuples
    """

    def __init__(self, root, dataidxs=None, transform=None, target_transform=None,
                 loader=default_loader, is_valid_file=None):
        super(ImageFolderTruncated, self).__init__(root, loader, IMG_EXTENSIONS if is_valid_file is None else None,
                                                   transform=transform,
                                                   target_transform=target_transform,
                                                   is_valid_file=is_valid_file)
        self.imgs = self.samples
        self.dataidxs = dataidxs

        # Labels are captured over the FULL directory listing, before
        # truncation, so partitioning code can index them globally.
        self._train_labels = np.array([tup[-1] for tup in self.imgs])

        self.__build_truncated_dataset__()

    def __build_truncated_dataset__(self):
        # Restrict the sample list to the requested indices, if any.
        if self.dataidxs is not None:
            self.imgs = [self.imgs[idx] for idx in self.dataidxs]

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (sample, target) where target is class_index of the target class.
        """
        path, target = self.imgs[index]
        sample = self.loader(path)
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return sample, target

    @property
    def get_train_labels(self):
        return self._train_labels


# --- data_preprocessing/coco/dectection/data_loader.py ---
import logging

import os
import yaml
import math
import torch
import numpy as np
import torch.utils.data as data
import torchvision.transforms as transforms
from .datasets import create_dataloader
from pathlib import Path


def make_divisible(x, divisor):
    # Returns x evenly divisible by divisor (rounded up)
    return math.ceil(x / divisor) * divisor


def check_img_size(img_size, s=32):
    # Verify img_size is a multiple of stride s; round up and warn if not
    new_size = make_divisible(img_size, int(s))  # ceil gs-multiple
    if new_size != img_size:
        print('WARNING: --img-size %g must be multiple of max stride %g, updating to %g' % (img_size, s, new_size))
    return new_size


def partition_data(data_path, partition, n_nets):
    """Split sample indices among ``n_nets`` clients.

    ``data_path`` may be a listing file (one sample per line) or a
    directory; the sample count is derived accordingly.  Only the IID
    ('homo') scheme is implemented.

    Raises:
        NotImplementedError: for any other partition method (fixed:
            previously 'hetero' fell through and the function crashed with
            a NameError on the undefined return value).
    """
    if os.path.isfile(data_path):
        with open(data_path) as f:
            n_data = len(f.readlines())
    else:
        n_data = len(os.listdir(data_path))

    if partition == "homo":
        idxs = np.random.permutation(n_data)
        batch_idxs = np.array_split(idxs, n_nets)
        net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)}
    else:
        raise NotImplementedError("partition method '%s' is not supported" % partition)

    return net_dataidx_map


def load_partition_data_coco(opt, hyp, model):
    """Build global and per-client COCO detection dataloaders.

    Args:
        opt: parsed command-line options (paths, batch sizes, DDP ranks,
            client_num_in_total, partition_method, ...).
        hyp: augmentation hyperparameter dict.
        model: the detection model; only ``model.stride`` is read here to
            derive the grid size.

    Returns:
        (train_data_num, test_data_num, train_dataloader_global,
         test_dataloader_global, train_data_num_dict,
         train_data_loader_dict, test_data_loader_dict, nc)
    """
    save_dir, epochs, batch_size, total_batch_size, weights, rank = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict

    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]

    client_number = opt.client_num_in_total
    partition = opt.partition_method

    net_dataidx_map = partition_data(train_path, partition=partition, n_nets=client_number)
    net_dataidx_map_test = partition_data(test_path, partition=partition, n_nets=client_number)

    train_data_loader_dict = dict()
    test_data_loader_dict = dict()
    train_data_num_dict = dict()
    train_dataset_dict = dict()

    train_dataloader_global, train_dataset_global = create_dataloader(
        train_path, imgsz, batch_size, gs, opt,
        hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank,
        world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights)
    train_data_num = len(train_dataset_global)

    test_dataloader_global = create_dataloader(
        test_path, imgsz_test, total_batch_size, gs, opt,
        hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True,
        rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5)[0]
    test_data_num = len(test_dataloader_global.dataset)

    for i in range(client_number):
        print("net_dataidx_map trainer:", net_dataidx_map[i])
        dataloader, dataset = create_dataloader(
            train_path, imgsz, batch_size, gs, opt,
            hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank,
            world_size=opt.world_size, workers=opt.workers,
            image_weights=opt.image_weights, net_dataidx_map=net_dataidx_map[i])
        testloader = create_dataloader(
            test_path, imgsz_test, total_batch_size, gs, opt,
            hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True,
            rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5,
            net_dataidx_map=net_dataidx_map_test[i])[0]

        train_dataset_dict[i] = dataset
        train_data_num_dict[i] = len(dataset)
        train_data_loader_dict[i] = dataloader
        test_data_loader_dict[i] = testloader

    return train_data_num, test_data_num, train_dataloader_global, test_dataloader_global, \
        train_data_num_dict, train_data_loader_dict, test_data_loader_dict, nc
100644 index 0000000..48319b3 --- /dev/null +++ b/data_preprocessing/coco/dectection/dataset.py @@ -0,0 +1,968 @@ +# Dataset utils and dataloaders + +import glob +import logging +import math +import os +import random +import shutil +import time +from itertools import repeat +from multiprocessing.pool import ThreadPool +from pathlib import Path +from threading import Thread + +import cv2 +import numpy as np +import torch +from PIL import Image, ExifTags +from torch.utils.data import Dataset +from tqdm import tqdm +import sys + +from fedml_api.model.object_detection.yolov5.utils.general import xyxy2xywh, xywh2xyxy +from fedml_api.model.object_detection.yolov5.utils.torch_utils import torch_distributed_zero_first + +# Parameters +help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data' +img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng'] # acceptable image suffixes +vid_formats = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes +logger = logging.getLogger(__name__) + +# Get orientation exif tag +for orientation in ExifTags.TAGS.keys(): + if ExifTags.TAGS[orientation] == 'Orientation': + break + + +def get_hash(files): + # Returns a single hash value of a list of files + return sum(os.path.getsize(f) for f in files if os.path.isfile(f)) + + +def exif_size(img): + # Returns exif-corrected PIL size + s = img.size # (width, height) + try: + rotation = dict(img._getexif().items())[orientation] + if rotation == 6: # rotation 270 + s = (s[1], s[0]) + elif rotation == 8: # rotation 90 + s = (s[1], s[0]) + except: + pass + + return s + +def partition_data(data_path, partition, n_nets): + if os.path.isfile(data_path): + with open(data_path) as f: + data = f.readlines() + n_data = len(data) + else: + n_data = len(os.listdir(data_path)) + if partition == "homo": + total_num = n_data + idxs = np.random.permutation(total_num) + batch_idxs = np.array_split(idxs, n_nets) + net_dataidx_map = {i: batch_idxs[i] for i in 
range(n_nets)} + elif partition == 'hetero': + print("not support!") + pass + + return net_dataidx_map + +def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False, + rank=-1, world_size=1, workers=8, image_weights=False, net_dataidx_map=None): + # Make sure only the first process in DDP process the dataset first, and the following others can use the cache + with torch_distributed_zero_first(rank): + dataset = LoadImagesAndLabels(path, imgsz, batch_size, + augment=augment, # augment images + hyp=hyp, # augmentation hyperparameters + rect=rect, # rectangular training + cache_images=cache, + single_cls=opt.single_cls, + stride=int(stride), + pad=pad, + rank=rank, + image_weights=image_weights, + net_dataidx_map=net_dataidx_map) + + batch_size = min(batch_size, len(dataset)) + nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, workers]) # number of workers + sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None + loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader + # Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader() + dataloader = loader(dataset, + batch_size=batch_size, + num_workers=nw, + sampler=sampler, + pin_memory=True, + collate_fn=LoadImagesAndLabels.collate_fn) + return dataloader, dataset + + +class InfiniteDataLoader(torch.utils.data.dataloader.DataLoader): + """ Dataloader that reuses workers + + Uses same syntax as vanilla DataLoader + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler)) + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever + + 
Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) + + +class LoadImages: # for inference + def __init__(self, path, img_size=640): + p = str(Path(path)) # os-agnostic + p = os.path.abspath(p) # absolute path + if '*' in p: + files = sorted(glob.glob(p, recursive=True)) # glob + elif os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '*.*'))) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise Exception('ERROR: %s does not exist' % p) + + images = [x for x in files if x.split('.')[-1].lower() in img_formats] + videos = [x for x in files if x.split('.')[-1].lower() in vid_formats] + ni, nv = len(images), len(videos) + + self.img_size = img_size + self.files = images + videos + self.nf = ni + nv # number of files + self.video_flag = [False] * ni + [True] * nv + self.mode = 'image' + if any(videos): + self.new_video(videos[0]) # new video + else: + self.cap = None + assert self.nf > 0, 'No images or videos found in %s. 
Supported formats are:\nimages: %s\nvideos: %s' % \ + (p, img_formats, vid_formats) + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + + if self.video_flag[self.count]: + # Read video + self.mode = 'video' + ret_val, img0 = self.cap.read() + if not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + else: + path = self.files[self.count] + self.new_video(path) + ret_val, img0 = self.cap.read() + + self.frame += 1 + print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nf, self.frame, self.nframes, path), end='') + + else: + # Read image + self.count += 1 + img0 = cv2.imread(path) # BGR + assert img0 is not None, 'Image Not Found ' + path + print('image %g/%g %s: ' % (self.count, self.nf, path), end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return path, img, img0, self.cap + + def new_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files + + +class LoadWebcam: # for inference + def __init__(self, pipe='0', img_size=640): + self.img_size = img_size + + if pipe.isnumeric(): + pipe = eval(pipe) # local camera + # pipe = 'rtsp://192.168.1.64/1' # IP camera + # pipe = 'rtsp://username:password@192.168.1.64/1' # IP camera with login + # pipe = 'http://wmccpinetop.axiscam.net/mjpg/video.mjpg' # IP golf camera + + self.pipe = pipe + self.cap = cv2.VideoCapture(pipe) # video capture object + self.cap.set(cv2.CAP_PROP_BUFFERSIZE, 3) # set buffer size + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if cv2.waitKey(1) == ord('q'): # q to quit + self.cap.release() + 
cv2.destroyAllWindows() + raise StopIteration + + # Read frame + if self.pipe == 0: # local camera + ret_val, img0 = self.cap.read() + img0 = cv2.flip(img0, 1) # flip left-right + else: # IP camera + n = 0 + while True: + n += 1 + self.cap.grab() + if n % 30 == 0: # skip frames + ret_val, img0 = self.cap.retrieve() + if ret_val: + break + + # Print + assert ret_val, 'Camera Error %s' % self.pipe + img_path = 'webcam.jpg' + print('webcam %g: ' % self.count, end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return img_path, img, img0, None + + def __len__(self): + return 0 + + +class LoadStreams: # multiple IP or RTSP cameras + def __init__(self, sources='streams.txt', img_size=640): + self.mode = 'stream' + self.img_size = img_size + + if os.path.isfile(sources): + with open(sources, 'r') as f: + sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())] + else: + sources = [sources] + + n = len(sources) + self.imgs = [None] * n + self.sources = sources + for i, s in enumerate(sources): + # Start the thread to read frames from the video stream + print('%g/%g: %s... ' % (i + 1, n, s), end='') + cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s) + assert cap.isOpened(), 'Failed to open %s' % s + w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) % 100 + _, self.imgs[i] = cap.read() # guarantee first frame + thread = Thread(target=self.update, args=([i, cap]), daemon=True) + print(' success (%gx%g at %.2f FPS).' 
% (w, h, fps)) + thread.start() + print('') # newline + + # check for common shapes + s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes + self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal + if not self.rect: + print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.') + + def update(self, index, cap): + # Read next stream frame in a daemon thread + n = 0 + while cap.isOpened(): + n += 1 + # _, self.imgs[index] = cap.read() + cap.grab() + if n == 4: # read every 4th frame + _, self.imgs[index] = cap.retrieve() + n = 0 + time.sleep(0.01) # wait time + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + img0 = self.imgs.copy() + if cv2.waitKey(1) == ord('q'): # q to quit + cv2.destroyAllWindows() + raise StopIteration + + # Letterbox + img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0] + + # Stack + img = np.stack(img, 0) + + # Convert + img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416 + img = np.ascontiguousarray(img) + + return self.sources, img, img0, None + + def __len__(self): + return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years + + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep # /images/, /labels/ substrings + return [x.replace(sa, sb, 1).replace('.' 
+ x.split('.')[-1], '.txt') for x in img_paths] + + +class LoadImagesAndLabels(Dataset): # for training/testing + def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False, + cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1, net_dataidx_map=None): + self.img_size = img_size + self.augment = augment + self.hyp = hyp + self.image_weights = image_weights + self.rect = False if image_weights else rect + self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training) + self.mosaic_border = [-img_size // 2, -img_size // 2] + self.stride = stride + + try: + f = [] # image files + for p in path if isinstance(path, list) else [path]: + p = Path(p) # os-agnostic + if p.is_dir(): # dir + f += glob.glob(str(p / '**' / '*.*'), recursive=True) + elif p.is_file(): # file + with open(p, 'r') as t: + t = t.read().strip().splitlines() + parent = str(p.parent) + os.sep + f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + else: + raise Exception('%s does not exist' % p) + self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats]) + assert self.img_files, 'No images found' + except Exception as e: + raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) + + # Check cache + self.label_files = img2label_paths(self.img_files) # labels + cache_path = Path(self.label_files[0]).parent.with_suffix('.cache') # cached labels + if cache_path.is_file(): + cache = torch.load(cache_path) # load + if cache['hash'] != get_hash(self.label_files + self.img_files) or 'results' not in cache: # changed + cache = self.cache_labels(cache_path) # re-cache + else: + cache = self.cache_labels(cache_path) # cache + + # Display cache + [nf, nm, ne, nc, n] = cache.pop('results') # found, missing, empty, corrupted, total + desc = f"Scanning '{cache_path}' for images and labels... 
{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + tqdm(None, desc=desc, total=n, initial=n) + assert nf > 0 or not augment, f'No labels found in {cache_path}. Can not train without labels. See {help_url}' + + # Read cache + cache.pop('hash') # remove hash + labels, shapes = zip(*cache.values()) + self.labels = list(labels) + self.shapes = shapes + # self.shapes = np.array(shapes, dtype=np.float64) + self.img_files = list(cache.keys()) # update + self.label_files = img2label_paths(cache.keys()) # update + + # client + print('net_dataidx_map:', net_dataidx_map) + print("len label:", len(self.labels)) + # print("shapes:", self.shapes) + # 这里根据net_dataidx_map来更新labels和图像等数据 + if net_dataidx_map is not None: + self.labels = [self.labels[i-1] for i in net_dataidx_map] + self.shapes = [self.shapes[i-1] for i in net_dataidx_map] + self.img_files = [self.img_files[i-1] for i in net_dataidx_map] + self.label_files = [self.label_files[i-1] for i in net_dataidx_map] + + self.shapes = np.array(self.shapes, dtype=np.float64) + print("after shapes:", self.shapes, len(self.shapes)) + if single_cls: + for x in self.labels: + x[:, 0] = 0 + + n = len(self.shapes) # number of images + bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index + nb = bi[-1] + 1 # number of batches + self.batch = bi # batch index of image + self.n = n + self.indices = range(n) + print("indices:", n, self.indices) + + # Rectangular Training + if self.rect: + # Sort by aspect ratio + s = self.shapes # wh + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_files = [self.img_files[i] for i in irect] + self.label_files = [self.label_files[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * nb + for i in range(nb): + ari = ar[bi == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [maxi, 1] + elif mini > 1: + shapes[i] = [1, 1 / mini] + + 
self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride + + # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM) + self.imgs = [None] * n + if cache_images: + gb = 0 # Gigabytes of cached images + self.img_hw0, self.img_hw = [None] * n, [None] * n + results = ThreadPool(8).imap(lambda x: load_image(*x), zip(repeat(self), range(n))) # 8 threads + pbar = tqdm(enumerate(results), total=n) + for i, x in pbar: + self.imgs[i], self.img_hw0[i], self.img_hw[i] = x # img, hw_original, hw_resized = load_image(self, i) + gb += self.imgs[i].nbytes + pbar.desc = 'Caching images (%.1fGB)' % (gb / 1E9) + + def cache_labels(self, path=Path('./labels.cache')): + # Cache dataset labels, check images and read shapes + x = {} # dict + nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, duplicate + pbar = tqdm(zip(self.img_files, self.label_files), desc='Scanning images', total=len(self.img_files)) + for i, (im_file, lb_file) in enumerate(pbar): + try: + # verify images + im = Image.open(im_file) + im.verify() # PIL verify + shape = exif_size(im) # image size + assert (shape[0] > 9) & (shape[1] > 9), 'image size <10 pixels' + + # verify labels + if os.path.isfile(lb_file): + nf += 1 # label found + with open(lb_file, 'r') as f: + l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + if len(l): + assert l.shape[1] == 5, 'labels require 5 columns each' + assert (l >= 0).all(), 'negative labels' + assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels' + else: + ne += 1 # label empty + l = np.zeros((0, 5), dtype=np.float32) + else: + nm += 1 # label missing + l = np.zeros((0, 5), dtype=np.float32) + x[im_file] = [l, shape] + except Exception as e: + nc += 1 + print('WARNING: Ignoring corrupted image and/or label %s: %s' % (im_file, e)) + + pbar.desc = f"Scanning 
'{path.parent / path.stem}' for images and labels... " \ + f"{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + + if nf == 0: + print(f'WARNING: No labels found in {path}. See {help_url}') + + x['hash'] = get_hash(self.label_files + self.img_files) + x['results'] = [nf, nm, ne, nc, i + 1] + torch.save(x, path) # save for next time + logging.info(f"New cache created: {path}") + return x + + def __len__(self): + return len(self.img_files) + + # def __iter__(self): + # self.count = -1 + # print('ran dataset iter') + # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) + # return self + + def __getitem__(self, index): + index = self.indices[index] # linear, shuffled, or image_weights + + hyp = self.hyp + mosaic = self.mosaic and random.random() < hyp['mosaic'] + if mosaic: + # Load mosaic + img, labels = load_mosaic(self, index) + shapes = None + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if random.random() < hyp['mixup']: + img2, labels2 = load_mosaic(self, random.randint(0, self.n - 1)) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + + else: + # Load image + img, (h0, w0), (h, w) = load_image(self, index) + + # Letterbox + shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape + img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling + + # Load labels + labels = [] + x = self.labels[index] + if x.size > 0: + # Normalized xywh to pixel xyxy format + labels = x.copy() + labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0] # pad width + labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1] # pad height + labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0] + labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1] + + if self.augment: 
+ # Augment imagespace + if not mosaic: + img, labels = random_perspective(img, labels, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + shear=hyp['shear'], + perspective=hyp['perspective']) + + # Augment colorspace + augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) + + # Apply cutouts + # if random.random() < 0.9: + # labels = cutout(img, labels) + + nL = len(labels) # number of labels + if nL: + labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) # convert xyxy to xywh + labels[:, [2, 4]] /= img.shape[0] # normalized height 0-1 + labels[:, [1, 3]] /= img.shape[1] # normalized width 0-1 + + if self.augment: + # flip up-down + if random.random() < hyp['flipud']: + img = np.flipud(img) + if nL: + labels[:, 2] = 1 - labels[:, 2] + + # flip left-right + if random.random() < hyp['fliplr']: + img = np.fliplr(img) + if nL: + labels[:, 1] = 1 - labels[:, 1] + + labels_out = torch.zeros((nL, 6)) + if nL: + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return torch.from_numpy(img), labels_out, self.img_files[index], shapes + + @staticmethod + def collate_fn(batch): + img, label, path, shapes = zip(*batch) # transposed + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes + + +# Ancillary functions -------------------------------------------------------------------------------------------------- +def load_image(self, index): + # loads 1 image from dataset, returns img, original hw, resized hw + img = self.imgs[index] + if img is None: # not cached + path = self.img_files[index] + img = cv2.imread(path) # BGR + assert img is not None, 'Image Not Found ' + path + h0, w0 = img.shape[:2] # orig hw + r = self.img_size / max(h0, w0) # resize image to img_size + if r != 1: # always resize down, only resize up if 
training with augmentation + interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR + img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp) + return img, (h0, w0), img.shape[:2] # img, hw_original, hw_resized + else: + return self.imgs[index], self.img_hw0[index], self.img_hw[index] # img, hw_original, hw_resized + + +def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5): + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=np.int16) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + + # Histogram equalization + # if random.random() < 0.2: + # for i in range(3): + # img[:, :, i] = cv2.equalizeHist(img[:, :, i]) + + +def load_mosaic(self, index): + # loads images in a mosaic + + labels4 = [] + s = self.img_size + yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border] # mosaic center x, y + indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(3)] # 3 additional image indices + for i, index in enumerate(indices): + # Load image + img, _, (h, w) = load_image(self, index) + + # place img in img4 + if i == 0: # top left + img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + 
elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + x = self.labels[index] + labels = x.copy() + if x.size > 0: # Normalized xywh to pixel xyxy format + labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw + labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh + labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw + labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh + labels4.append(labels) + + # Concat/clip labels + if len(labels4): + labels4 = np.concatenate(labels4, 0) + np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:]) # use with random_perspective + # img4, labels4 = replicate(img4, labels4) # replicate + + # Augment + img4, labels4 = random_perspective(img4, labels4, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + perspective=self.hyp['perspective'], + border=self.mosaic_border) # border to remove + + return img4, labels4 + + +def replicate(img, labels): + # Replicate labels + h, w = img.shape[:2] + boxes = labels[:, 1:].astype(int) + x1, y1, x2, y2 = boxes.T + s = ((x2 - x1) + (y2 - y1)) / 2 # side length (pixels) + for i in s.argsort()[:round(s.size * 0.5)]: # smallest indices + x1b, y1b, x2b, y2b = boxes[i] + bh, bw = y2b - y1b, x2b - x1b + yc, xc = int(random.uniform(0, h - bh)), int(random.uniform(0, w - bw)) # offset x, y + x1a, y1a, x2a, y2a = [xc, yc, xc + bw, yc + bh] + img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + labels = np.append(labels, [[labels[i, 0], x1a, y1a, x2a, y2a]], axis=0) + + return img, labels + + +def letterbox(img, new_shape=(640, 640), 
color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True): + # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232 + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return img, ratio, (dw, dh) + + +def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # targets = [cls, xyxy] + + height = img.shape[0] + border[0] * 2 # shape(h,w,c) + width = img.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = -img.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3) + P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) + P[2, 1] = 
random.uniform(-perspective, perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3) + a = random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if perspective: + img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + # Visualize + # import matplotlib.pyplot as plt + # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() + # ax[0].imshow(img[:, :, ::-1]) # base + # ax[1].imshow(img2[:, :, ::-1]) # warped + + # Transform label coordinates + n = len(targets) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + if perspective: + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + else: # affine + xy = xy[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # # apply angle-based reduction of bounding boxes + # radians = a * math.pi / 180 + # 
reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + # x = (xy[:, 2] + xy[:, 0]) / 2 + # y = (xy[:, 3] + xy[:, 1]) / 2 + # w = (xy[:, 2] - xy[:, 0]) * reduction + # h = (xy[:, 3] - xy[:, 1]) * reduction + # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + + # filter candidates + i = box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T) + targets = targets[i] + targets[:, 1:5] = xy[i] + + return img, targets + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1): # box1(4,n), box2(4,n) + # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates + + +def cutout(image, labels): + # Applies image cutout augmentation https://arxiv.org/abs/1708.04552 + h, w = image.shape[:2] + + def bbox_ioa(box1, box2): + # Returns the intersection over box2 area given box1, box2. box1 is 4, box2 is nx4. 
boxes are x1y1x2y2 + box2 = box2.transpose() + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + 1e-16 + + # Intersection over box2 area + return inter_area / box2_area + + # create random masks + scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16 # image size fraction + for s in scales: + mask_h = random.randint(1, int(h * s)) + mask_w = random.randint(1, int(w * s)) + + # box + xmin = max(0, random.randint(0, w) - mask_w // 2) + ymin = max(0, random.randint(0, h) - mask_h // 2) + xmax = min(w, xmin + mask_w) + ymax = min(h, ymin + mask_h) + + # apply random color mask + image[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)] + + # return unobscured labels + if len(labels) and s > 0.03: + box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32) + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + labels = labels[ioa < 0.60] # remove >60% obscured labels + + return labels + + +def create_folder(path='./new'): + # Create folder + if os.path.exists(path): + shutil.rmtree(path) # delete output folder + os.makedirs(path) # make new output folder + + +def flatten_recursive(path='../coco128'): + # Flatten a recursive directory by bringing all files to top level + new_path = Path(path + '_flat') + create_folder(new_path) + for file in tqdm(glob.glob(str(Path(path)) + '/**/*.*', recursive=True)): + shutil.copyfile(file, new_path / Path(file).name) + + +def extract_boxes(path='../coco128/'): # from utils.datasets import *; extract_boxes('../coco128') + # Convert detection dataset into classification dataset, with one directory per class + + path = Path(path) # 
images dir + shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing + files = list(path.rglob('*.*')) + n = len(files) # number of files + for im_file in tqdm(files, total=n): + if im_file.suffix[1:] in img_formats: + # image + im = cv2.imread(str(im_file))[..., ::-1] # BGR to RGB + h, w = im.shape[:2] + + # labels + lb_file = Path(img2label_paths([str(im_file)])[0]) + if Path(lb_file).exists(): + with open(lb_file, 'r') as f: + lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + + for j, x in enumerate(lb): + c = int(x[0]) # class + f = (path / 'classifier') / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg' # new filename + if not f.parent.is_dir(): + f.parent.mkdir(parents=True) + + b = x[1:] * [w, h, w, h] # box + # b[2:] = b[2:].max() # rectangle to square + b[2:] = b[2:] * 1.2 + 3 # pad + b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(np.int) + + b[[0, 2]] = np.clip(b[[0, 2]], 0, w) # clip boxes outside of image + b[[1, 3]] = np.clip(b[[1, 3]], 0, h) + assert cv2.imwrite(str(f), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {f}' + + +def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0)): # from utils.datasets import *; autosplit('../coco128') + """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files + # Arguments + path: Path to images directory + weights: Train, val, test weights (list) + """ + path = Path(path) # images dir + files = list(path.rglob('*.*')) + n = len(files) # number of files + indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split + txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt'] # 3 txt files + [(path / x).unlink() for x in txt if (path / x).exists()] # remove existing + for i, img in tqdm(zip(indices, files), total=n): + if img.suffix[1:] in img_formats: + with open(path / txt[i], 'a') as f: + f.write(str(img) + '\n') # add image to txt file diff --git 
a/experiments/centralized/classification/README.md b/experiments/centralized/classification/README.md new file mode 100644 index 0000000..138ca2e --- /dev/null +++ b/experiments/centralized/classification/README.md @@ -0,0 +1,43 @@ + +# PyTorch DDP classification + +## lr_scheduler parameter reference: + + +EfficientNet-B0 with RandAugment - 77.7 top-1, 95.3 top-5 + + + +``` + +sh run_classification.sh 8 1 0 127.0.0.1 11111 "0,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model efficientnet -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.048" + +# crashed need to rerun +sh run_classification.sh 3 1 0 127.0.0.1 11112 "0,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model efficientnet --distributed --if-timm-dataset -b 256 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .048" + +sh run_classification.sh 3 1 0 127.0.0.1 11112 "0,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model efficientnet --distributed --if-timm-dataset -b 256 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .048" + + +sh run_classification.sh 3 1 0 127.0.0.1 11112 "0,1,2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset 
--model efficientnet --distributed --if-timm-dataset -b 256 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .048" + +``` + + +MobileNetV3-Large-100 - 75.766 top-1, 92.542 top-5 + + + +``` +# crashed need to rerun +sh run_classification.sh 4 1 0 127.0.0.1 11113 "0,1,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model mobilenet_v3 --distributed --if-timm-dataset -b 256 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .064 --lr-noise 0.42 0.9" + +# crashed need to rerun +sh run_classification.sh 3 1 0 127.0.0.1 11113 "0,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model mobilenet_v3 --distributed --if-timm-dataset -b 256 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .05 --lr-noise 0.35 0.9" + +sh run_classification.sh 3 1 0 127.0.0.1 11113 "0,1,2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model mobilenet_v3 --distributed --if-timm-dataset -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .064 --lr-noise 0.42 0.9" + + +``` +# kill all processes +kill $(ps 
aux | grep "ddp_classification.py" | grep -v grep | awk '{print $2}') +``` \ No newline at end of file diff --git a/experiments/centralized/classification/__init__.py b/experiments/centralized/classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/centralized/classification/configs/example.conf b/experiments/centralized/classification/configs/example.conf new file mode 100644 index 0000000..a1c3d4d --- /dev/null +++ b/experiments/centralized/classification/configs/example.conf @@ -0,0 +1,4 @@ +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 \ No newline at end of file diff --git a/experiments/centralized/classification/ddp_classification.py b/experiments/centralized/classification/ddp_classification.py new file mode 100644 index 0000000..b60fe97 --- /dev/null +++ b/experiments/centralized/classification/ddp_classification.py @@ -0,0 +1,482 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback + + +import numpy as np +import psutil +import setproctitle +import wandb +from mpi4py import MPI +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel as DDP +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +from utils.tracker import RuntimeTracker +from utils.metrics import Metrics +from utils.wandb_util import wandb_log +from data_preprocessing.ImageNet.data_loader import distributed_centralized_ImageNet_loader +from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks + +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import 
load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + + + + + +from training.centralized_classification_trainer import ClassificationTrainer + + + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + parser.add_argument('--batch_size', '-b', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + # parser.add_argument('--epochs', type=int, default=5, metavar='EP', + # help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + 
parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--local_rank', type=int, default=0, + help='given by torch.distributed.launch') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + # parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + # 
help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.0001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default='step', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + 
help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + parser.add_argument('--epochs', type=int, default=200, metavar='N', + help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + 
help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + 
parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = distributed_centralized_ImageNet_loader(dataset=dataset_name, data_dir=args.data_dir, + world_size=args.client_num_in_total, + rank=args.rank, batch_size=args.batch_size, + args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'gld23k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'gld23k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=233, batch_size=args.batch_size) + + elif dataset_name == "gld160k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'gld160k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'gld160k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, batch_size=args.batch_size) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + raise Exception("no such dataset") + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = data_loader(args.dataset, args.data_dir, args.partition_method, + args.partition_alpha, args.client_num_in_total, args.batch_size) + + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. 
model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + +def init_ddp(): + # use InfiniBand + os.environ['NCCL_DEBUG'] = 'INFO' + os.environ['NCCL_SOCKET_IFNAME'] = 'lo' + + # This the global rank: 0, 1, 2, ..., 15 + global_rank = int(os.environ['RANK']) + print("int(os.environ['RANK']) = %d" % global_rank) + + # This the globak world_size + world_size = int(os.environ['WORLD_SIZE']) + print("world_size = %d" % world_size) + + # initialize the process group + # dist.init_process_group(backend="nccl", rank=global_rank, world_size=world_size) + dist.init_process_group(backend="nccl", init_method="env://") + + local_rank = args.local_rank + print(f"Running basic DDP example on local rank {local_rank}.") + return local_rank, global_rank + + +def get_ddp_model(model, local_rank): + return DDP(model, device_ids=[local_rank], output_device=local_rank) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="PyTorch DDP Demo") + args = add_args(parser) + # 
parser.add_argument("--local_rank", type=int, default=0) + args = parser.parse_args() + logging.info(args) + # args.weight_decay = args.wd + args.wd = args.weight_decay + + # DDP + local_rank, global_rank = init_ddp() + process_id = global_rank + args.rank = global_rank + + # customize the process name + str_process_name = "ddp_classification:" + str(process_id) + setproctitle.setproctitle(str_process_name) + + # customize the log format + while logging.getLogger().handlers: + logging.getLogger().handlers.clear() + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter(str(process_id) + + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') + console.setFormatter(formatter) + # Create an instance + logging.getLogger().addHandler(console) + hostname = socket.gethostname() + logging.info("#############process ID = " + str(process_id) + + ", host name = " + hostname + "########" + + ", process ID = " + str(os.getpid()) + + ", process Name = " + str(psutil.Process(os.getpid()))) + + + name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). 
+ if process_id == 0: + wandb.init( + entity="automl", + project="fedcv-classification", + name="FedCV (c new)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # GPU + device = torch.device("cuda:" + str(local_rank)) + + # load data + dataset = load_data(args, args.dataset) + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. + # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + model = model.to(device) + model = get_ddp_model(model, local_rank) + if global_rank == 0: + print(model) + + metrics = Metrics(topks=[1], task="classification") + train_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + test_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + + model_trainer = ClassificationTrainer(model, device, args) + for epoch in range(args.epochs): + model_trainer.train_one_epoch(train_data_global, device, args, epoch) + if global_rank == 0: + model_trainer.test(test_data_global, device, args, metrics, test_tracker) + wandb_log(prefix='Test', sp_values=test_tracker(), com_values={"epoch": epoch}) + # I forget to reset the tracker previously + test_tracker.reset() + dist.destroy_process_group() diff --git a/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 
EfficientNet.md new file mode 100644 index 0000000..62d25c6 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md @@ -0,0 +1,70 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +## Pure + +``` +# running +./single_run_classification.sh "0" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Pure with normal image 
transform + +``` + +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add AutoAugmentation +``` + + +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Add model EMA +``` +# running + +./single_run_classification.sh "3" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "3" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add all +``` +./single_run_classification.sh "3" ~/py36/bin/python " --dataset 
ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "0" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" + +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md new file mode 100644 index 0000000..6eaeaad --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md @@ -0,0 +1,79 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# 
DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +## Pure + +``` + +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +# running +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +# running +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.06" +``` + +## Pure with normal image transform + +``` +# running +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 
--drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.06" +``` + + +## Add AutoAugmentation +``` +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +# running +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +# running +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset 
ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.06" +``` + +## Add model EMA +``` +# running +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add all +``` +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir 
/home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" + +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/cifar100 EfficientNet.md b/experiments/centralized/classification/experiment_scripts/cifar100 EfficientNet.md new file mode 100644 index 0000000..7c3f319 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/cifar100 EfficientNet.md @@ -0,0 +1,22 @@ +# CIFAR100 MobileNetV3-Large-100 + + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +cifar100_data_dir=/home/datasets/cifar100 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +# Pure +./single_run_classification.sh "0" ~/py36/bin/python " --dataset cifar100 --data_dir /home/datasets/cifar100 --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + + + + + + diff --git a/experiments/centralized/classification/experiment_scripts/cifar100 MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/cifar100 MobileNetV3.md new file mode 100644 index 0000000..5900292 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/cifar100 MobileNetV3.md @@ -0,0 +1,22 @@ +# CIFAR100 MobileNetV3-Large-100 + + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks 
+cifar10_data_dir=/home/datasets/cifar10 +cifar100_data_dir=/home/datasets/cifar100 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +# Pure +./single_run_classification.sh "1" ~/py36/bin/python " --dataset cifar100 --data_dir /home/datasets/cifar100 --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + + + + + + diff --git a/experiments/centralized/classification/experiment_scripts/gld160k EfficientNet.md b/experiments/centralized/classification/experiment_scripts/gld160k EfficientNet.md new file mode 100644 index 0000000..b8e91ce --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld160k EfficientNet.md @@ -0,0 +1,62 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 +mnist_data_dir=/nfs_home/datasets/mnist + + +./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 64 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 
0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +## Pure + +``` +cmd=" +cd ~/FedCV_classification/experiments/centralized/classification; +./single_run_classification.sh \"0\" ~/miniconda3/bin/python \" --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03\" " +ssh host5 $cmd +``` + +## Pure with normal image transform + +``` +cmd=" +./single_run_classification.sh \"0\" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +``` + +## Add model EMA +``` +./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add all +``` 
+./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/gld160k MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/gld160k MobileNetV3.md new file mode 100644 index 0000000..0edbc91 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld160k MobileNetV3.md @@ -0,0 +1,60 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=/home/comp/20481896/datasets/landmarks +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=/home/comp/20481896/datasets/cifar10 +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 +mnist_data_dir=/nfs_home/datasets/mnist + + +## Pure + +``` + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +``` + +## Pure with normal image transform + +``` + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +``` + +## Add model EMA +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add all +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode 
pixel --reprob 0.2 --lr 0.03" +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/gld23k EfficientNet.md b/experiments/centralized/classification/experiment_scripts/gld23k EfficientNet.md new file mode 100644 index 0000000..ff2dec6 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld23k EfficientNet.md @@ -0,0 +1,65 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 +mnist_data_dir=/nfs_home/datasets/mnist + + + +## Pure + +``` + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Pure with normal image transform + +``` + +./single_run_classification.sh "0" 
~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Add model EMA +``` +# running +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.003" + +``` + + +## Add all +``` 
+./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/gld23k MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/gld23k MobileNetV3.md new file mode 100644 index 0000000..e339a0e --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld23k MobileNetV3.md @@ -0,0 +1,79 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 
+mnist_data_dir=/nfs_home/datasets/mnist + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + + +## Pure + +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Pure with normal image transform + +``` + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks 
--data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel 
--reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Add model EMA +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add all +``` +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir 
/home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" + + + +``` + + diff --git a/experiments/centralized/classification/remote_run.sh b/experiments/centralized/classification/remote_run.sh new file mode 100644 index 0000000..931f374 --- /dev/null +++ b/experiments/centralized/classification/remote_run.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +CD_PATH=$1 +HOST=$2 +EXECUTE_CMD=$3 + +echo $CD_PATH +echo $HOST +echo $EXECUTE_CMD + +cmd="cd $CD_PATH ; $EXECUTE_CMD" +echo $cmd +ssh $HOST $cmd diff --git a/experiments/centralized/classification/run_classification.sh b/experiments/centralized/classification/run_classification.sh new file mode 100644 index 0000000..a426e86 --- /dev/null +++ b/experiments/centralized/classification/run_classification.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +NPROC_PER_NODE=$1 +NNODE=$2 +NODE_RANK=$3 +MASTER_ADDR=$4 +MASTER_PORT=$5 +GPU_UTIL=$6 +PYTHON=$7 +ARGS=$8 + + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +--nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +./ddp_classification.py --client_num_in_total $NPROC_PER_NODE $ARGS + +# 
CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/centralized/classification/run_classification_with_conf.sh b/experiments/centralized/classification/run_classification_with_conf.sh new file mode 100644 index 0000000..a426e86 --- /dev/null +++ b/experiments/centralized/classification/run_classification_with_conf.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +NPROC_PER_NODE=$1 +NNODE=$2 +NODE_RANK=$3 +MASTER_ADDR=$4 +MASTER_PORT=$5 +GPU_UTIL=$6 +PYTHON=$7 +ARGS=$8 + + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +--nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +./ddp_classification.py --client_num_in_total $NPROC_PER_NODE $ARGS + +# CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/centralized/classification/single_classification.py b/experiments/centralized/classification/single_classification.py new file mode 100755 index 0000000..1e8b72f --- /dev/null +++ b/experiments/centralized/classification/single_classification.py @@ -0,0 +1,437 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback + + +import numpy as np +import psutil +import setproctitle +import wandb +import torch +import torch.nn as nn +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +from utils.tracker import RuntimeTracker +from utils.metrics import Metrics 
+from utils.wandb_util import wandb_log +from data_preprocessing.ImageNet.data_loader import distributed_centralized_ImageNet_loader +from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + + +from training.centralized_classification_trainer import ClassificationTrainer + +from utils.logger import ( + logging_config +) + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + parser.add_argument('--batch_size', '-b', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # 
help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + # parser.add_argument('--epochs', type=int, default=5, metavar='EP', + # help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--local_rank', type=int, default=0, + help='given by torch.distributed.launch') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # logging settings + parser.add_argument('--level', type=str, default='INFO', + help='level of logging') + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + 
parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + # parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + # help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.0001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default=None, type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + 
parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + parser.add_argument('--epochs', type=int, default=200, metavar='N', + help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training 
augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. 
(default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + 
parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = distributed_centralized_ImageNet_loader(dataset=dataset_name, data_dir=args.data_dir, + world_size=args.client_num_in_total, + rank=args.rank, batch_size=args.batch_size, + args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'mini_gld_train_split.csv') + fed_test_map_file = os.path.join(args.data_dir, 'mini_gld_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=233, batch_size=args.batch_size, args=args) + + elif dataset_name == "gld160k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'federated_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=1262, batch_size=args.batch_size, args=args) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = 
load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + raise Exception("no such dataset") + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = data_loader(args.dataset, args.data_dir, args.partition_method, + args.partition_alpha, args.client_num_in_total, args.batch_size) + + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="PyTorch DDP Demo") + args = add_args(parser) + # parser.add_argument("--local_rank", type=int, default=0) + args = parser.parse_args() + logging.info(args) + 
# args.weight_decay = args.wd + args.wd = args.weight_decay + + process_id = 0 + args.rank = 0 + + # customize the process name + str_process_name = "single_classification:" + str(process_id) + setproctitle.setproctitle(str_process_name) + + logging_config(args, process_id) + + + name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). + if process_id == 0: + wandb.init( + entity="automl", + project="fedcv-classification", + name="FedCV (c new)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # GPU + device = torch.device("cuda:" + str(process_id)) + + # load data + dataset = load_data(args, args.dataset) + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. 
+ # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + model = model.to(device) + print(model) + + metrics = Metrics(topks=[1], task="classification") + train_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + test_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + + model_trainer = ClassificationTrainer(model, device, args) + for epoch in range(args.epochs): + model_trainer.train_one_epoch(train_data_global, device, args, epoch, train_tracker, metrics) + model_trainer.test(test_data_global, device, args, test_tracker, metrics) + wandb_log(prefix='Test', sp_values=test_tracker(), com_values={"epoch": epoch}) + wandb_log(prefix='Train', sp_values=train_tracker(), com_values={"epoch": epoch}) + train_tracker.reset() + test_tracker.reset() diff --git a/experiments/centralized/classification/single_run_classification.sh b/experiments/centralized/classification/single_run_classification.sh new file mode 100755 index 0000000..eae0f4d --- /dev/null +++ b/experiments/centralized/classification/single_run_classification.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GPU_UTIL=$1 +PYTHON=$2 +ARGS=$3 + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON ./single_classification.py --client_num_in_total 1 $ARGS + +# CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/centralized/classification/single_run_classification_with_conf.sh b/experiments/centralized/classification/single_run_classification_with_conf.sh new file mode 100644 index 0000000..fbaf85b --- /dev/null +++ b/experiments/centralized/classification/single_run_classification_with_conf.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +GPU_UTIL=$1 +DATASET=$2 +DATA_DIR=$3 +ARGS=$4 + 
+source configs/cluster.conf +PYTHON=`cat configs/cluster.conf | grep PYTHON | awk -F= "{print $2}"` +data_dir=`cat configs/cluster.conf | grep $DATA_DIR | awk -F= "{print $2}"` + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON ./single_classification.py \ + --client_num_in_total 1 \ + --data_dir $data_dir --dataset $DATASET \ + $ARGS + +# CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/distributed/Detection/data/coco.yaml b/experiments/distributed/Detection/data/coco.yaml new file mode 100644 index 0000000..09f3a78 --- /dev/null +++ b/experiments/distributed/Detection/data/coco.yaml @@ -0,0 +1,35 @@ +# COCO 2017 dataset http://cocodataset.org +# Train command: python train.py --data coco.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /coco +# /yolov5 + + +# download command/URL (optional) +download: bash data/scripts/get_coco.sh + +# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/] +train: ../coco/train2017.txt # 118287 images +val: ../coco/val2017.txt # 5000 images +test: ../coco/test-dev2017.txt # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794 + +# number of classes +nc: 80 + +# class names +names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', + 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush'] + +# Print classes +# with open('data/coco.yaml') as f: +# d = yaml.load(f, Loader=yaml.FullLoader) # dict +# for i, x in enumerate(d['names']): +# print(i, x) diff --git a/experiments/distributed/Detection/data/coco128.yaml b/experiments/distributed/Detection/data/coco128.yaml new file mode 100644 index 0000000..12e1d79 --- /dev/null +++ b/experiments/distributed/Detection/data/coco128.yaml @@ -0,0 +1,28 @@ +# COCO 2017 dataset http://cocodataset.org - first 128 training images +# Train command: python train.py --data coco128.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /coco128 +# /yolov5 + + +# download command/URL (optional) +download: https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip + +# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/] +train: ../coco128/images/train2017/ # 128 images +val: ../coco128/images/train2017/ # 128 images + +# number of classes +nc: 80 + +# class names +names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 
'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', + 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush'] diff --git a/experiments/distributed/Detection/data/hyp.scratch.yaml b/experiments/distributed/Detection/data/hyp.scratch.yaml new file mode 100644 index 0000000..44f26b6 --- /dev/null +++ b/experiments/distributed/Detection/data/hyp.scratch.yaml @@ -0,0 +1,33 @@ +# Hyperparameters for COCO training from scratch +# python train.py --batch 40 --cfg yolov5m.yaml --weights '' --data coco.yaml --img 640 --epochs 300 +# See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials + + +lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3) +lrf: 0.2 # final OneCycleLR learning rate (lr0 * lrf) +momentum: 0.937 # SGD momentum/Adam beta1 +weight_decay: 0.0005 # optimizer weight decay 5e-4 +warmup_epochs: 3.0 # warmup epochs (fractions ok) +warmup_momentum: 0.8 # warmup initial momentum +warmup_bias_lr: 0.1 # warmup initial bias lr +box: 0.05 # box loss gain +cls: 0.5 # cls loss gain +cls_pw: 1.0 # cls BCELoss positive_weight +obj: 1.0 # obj loss gain (scale with pixels) +obj_pw: 1.0 # obj BCELoss positive_weight +iou_t: 0.20 # IoU training threshold +anchor_t: 4.0 # anchor-multiple threshold +# anchors: 3 # anchors per output layer (0 to ignore) +fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5) +hsv_h: 0.015 # image HSV-Hue augmentation (fraction) +hsv_s: 0.7 # image HSV-Saturation augmentation (fraction) +hsv_v: 0.4 # image HSV-Value augmentation (fraction) +degrees: 0.0 # image rotation (+/- deg) +translate: 0.1 # image translation (+/- fraction) +scale: 0.5 # image scale (+/- gain) +shear: 0.0 # image shear (+/- deg) +perspective: 0.0 # image perspective (+/- fraction), range 0-0.001 +flipud: 0.0 # image flip up-down (probability) +fliplr: 0.5 # image flip left-right 
(probability) +mosaic: 1.0 # image mosaic (probability) +mixup: 0.0 # image mixup (probability) diff --git a/experiments/distributed/Detection/data/scripts/get_coco.sh b/experiments/distributed/Detection/data/scripts/get_coco.sh new file mode 100644 index 0000000..157a0b0 --- /dev/null +++ b/experiments/distributed/Detection/data/scripts/get_coco.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# COCO 2017 dataset http://cocodataset.org +# Download command: bash data/scripts/get_coco.sh +# Train command: python train.py --data coco.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /coco +# /yolov5 + +# Download/unzip labels +d='../' # unzip directory +url=https://github.com/ultralytics/yolov5/releases/download/v1.0/ +f='coco2017labels.zip' # 68 MB +echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove + +# Download/unzip images +d='../coco/images' # unzip directory +url=http://images.cocodataset.org/zips/ +f1='train2017.zip' # 19G, 118k images +f2='val2017.zip' # 1G, 5k images +f3='test2017.zip' # 7G, 41k images (optional) +for f in $f1 $f2; do + echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove +done diff --git a/experiments/distributed/Detection/data/scripts/get_voc.sh b/experiments/distributed/Detection/data/scripts/get_voc.sh new file mode 100644 index 0000000..6bdaa9b --- /dev/null +++ b/experiments/distributed/Detection/data/scripts/get_voc.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC/ +# Download command: bash data/scripts/get_voc.sh +# Train command: python train.py --data voc.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /VOC +# /yolov5 + +start=$(date +%s) +mkdir -p ../tmp +cd ../tmp/ + +# Download/unzip images and labels +d='.' 
# unzip directory +url=https://github.com/ultralytics/yolov5/releases/download/v1.0/ +f1=VOCtrainval_06-Nov-2007.zip # 446MB, 5012 images +f2=VOCtest_06-Nov-2007.zip # 438MB, 4953 images +f3=VOCtrainval_11-May-2012.zip # 1.95GB, 17126 images +for f in $f1 $f2 $f3; do + echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove +done + +end=$(date +%s) +runtime=$((end - start)) +echo "Completed in" $runtime "seconds" + +echo "Splitting dataset..." +python3 - "$@" <train.txt +cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt >train.all.txt + +python3 - "$@" < + if process_ID == 0: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + return device + process_gpu_dict = dict() + for client_index in range(fl_worker_num): + gpu_index = client_index % gpu_num_per_machine + process_gpu_dict[client_index] = gpu_index + + logging.info(process_gpu_dict) + device = torch.device("cuda:" + str(process_gpu_dict[process_ID - 1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + return device + +if __name__ == '__main__': + import tracemalloc + tracemalloc.start() + parser = argparse.ArgumentParser() + parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path') + parser.add_argument('--cfg', type=str, default='', help='model.yaml path') + parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path') + parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') + # parser.add_argument('--epochs', type=int, default=300) + # parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs') + parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') + parser.add_argument('--rect', action='store_true', help='rectangular training') + parser.add_argument('--resume', nargs='?', const=True, 
default=False, help='resume most recent training') + parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') + parser.add_argument('--notest', action='store_true', help='only test final epoch') + parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') + parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') + parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') + parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') + parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') + parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') + parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') + parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') + + parser.add_argument('--optimizer', default=None, help='optimizer') + parser.add_argument('--scheduler', default=None , help='optimizer scheduler') + parser.add_argument('--wandb', default=None, help='wandb init') + parser.add_argument('--ema', default=None, help='ema init') + + parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') + parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') + parser.add_argument('--log-imgs', type=int, default=16, help='number of images for W&B logging, max 100') + parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers') + parser.add_argument('--project', default='runs/train', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', 
help='existing project/name ok, do not increment') + parser.add_argument('--linear-lr', action='store_true', help='linear LR') + parser.add_argument('--model_stride', default=0, type=int) + + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + parser.add_argument('--batch_size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + parser.add_argument('--epochs', type=int, default=5, metavar='EP', + help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the 
algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--round_idx', type=int, default=0, + help='round_idx') + opt = parser.parse_args() + opt.total_batch_size = opt.batch_size + opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 + opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 + set_logging(opt.global_rank) + if opt.global_rank in [-1, 0]: + check_git_status() + + # fedml + comm, process_id, worker_number = FedML_init() + logging.info(opt) + print("process_id:", process_id) + #if process_id == 0: + wandb.init( + # project="federated_nas", + project="fedml distributed", + name="FedAVG(d)" + str(opt.partition_method) + "-c" + str(opt.comm_round) + "-e" + str( + opt.epochs), + config=opt + ) + device = init_training_device(process_id, worker_number - 1, opt.gpu_num_per_server) + str_process_name = "FedAvg (distributed):" + str(process_id) + setproctitle.setproctitle(str_process_name) + + # customize the log format + # logging.basicConfig(level=logging.INFO, + logging.basicConfig(level=logging.DEBUG, + format=str( + process_id) + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', + datefmt='%a, %d %b %Y %H:%M:%S') + hostname = socket.gethostname() + logging.info("#############process ID = " + str(process_id) + + ", host name = " + hostname + "########" + + ", process ID = " + str(os.getpid()) + + ", process Name = " + str(psutil.Process(os.getpid()))) + + if opt.resume: # resume an interrupted run + ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path + assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' + with open(Path(ckpt).parent.parent / 'opt.yaml') as f: + opt = 
argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace + opt.cfg, opt.weights, opt.resume = '', ckpt, True + logger.info('Resuming training from %s' % ckpt) + else: + # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') + opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files + assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' + opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) + opt.name = 'evolve' if opt.evolve else opt.name + opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) # increment run + + # DDP mode + #device = select_device(opt.device, batch_size=opt.batch_size) + if opt.local_rank != -1: + assert torch.cuda.device_count() > opt.local_rank + torch.cuda.set_device(opt.local_rank) + device = torch.device('cuda', opt.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') # distributed backend + assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' + opt.batch_size = opt.total_batch_size // opt.world_size + + # Hyperparameters + with open(opt.hyp) as f: + hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps + if 'box' not in hyp: + warn('Compatibility: %s missing "box" which was renamed from "giou" in %s' % + (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120')) + hyp['box'] = hyp.pop('giou') + + + logger.info(f'Hyperparameters {hyp}') + save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank + + # Directories + wdir = save_dir / 'weights' + wdir.mkdir(parents=True, exist_ok=True) # make dir + last = wdir / 'last.pt' + best = wdir / 'best.pt' + results_file = save_dir / 'results.txt' + + # Save run settings + with open(save_dir / 'hyp.yaml', 'w') as f: 
+ yaml.dump(hyp, f, sort_keys=False) + with open(save_dir / 'opt.yaml', 'w') as f: + yaml.dump(vars(opt), f, sort_keys=False) + + # Configure + plots = not opt.evolve # create plots + cuda = device.type != 'cpu' + init_seeds(2 + rank) + with open(opt.data) as f: + data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict + with torch_distributed_zero_first(rank): + check_dataset(data_dict) # check + train_path = data_dict['train'] + test_path = data_dict['val'] + nc, names = (1, ['item']) if opt.single_cls else ( + int(data_dict['nc']), data_dict['names']) # number classes, names + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check + + # Model + print("weights:", weights) + pretrained = weights.endswith('.pt') + if pretrained: + with torch_distributed_zero_first(rank): + attempt_download(weights) # download if not found locally + ckpt = torch.load(weights, map_location=device) # load checkpoint + if hyp.get('anchors'): + ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor + model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create + exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys + state_dict = ckpt['model'].float().state_dict() # to FP32 + state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect + model.load_state_dict(state_dict, strict=False) # load + logger.info( + 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + else: + model = Model(opt.cfg, ch=3, nc=nc).to(device) # create + + # Freeze + freeze = [] # parameter names to freeze (full or partial) + for k, v in model.named_parameters(): + v.requires_grad = True # train all layers + if any(x in k for x in freeze): + print('freezing %s' % k) + v.requires_grad = False + + # fedml + + dataset = load_partition_data_coco(opt, hyp, model) + [train_data_num, test_data_num, train_data_global, test_data_global, + 
train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + opt.model_stride = model.stride + gs = int(max(model.stride)) # grid size (max stride) + imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples + + + # SyncBatchNorm + if opt.sync_bn and cuda and rank != -1: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) + logger.info('Using SyncBatchNorm()') + + # EMA + ema = ModelEMA(model) if rank in [-1, 0] else None + + # DDP mode + if cuda and rank != -1: + model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) + + hyp['cls'] *= nc / 80. + model.nc = nc # attach number of classes to model + model.hyp = hyp # attach hyperparameters to model + model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) + model.class_weights = labels_to_class_weights(train_data_global.dataset.labels, nc).to(device) # attach class weights + model.names = names + args = (opt, hyp) + # Optimizer + nbs = 64 # nominal batch size + accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing + hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + # logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + for k, v in model.named_modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d): + pg0.append(v.weight) # no decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + if opt.adam: + optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum + else: + optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) + + optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay + 
optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) + del pg0, pg1, pg2 + + total_epochs = epochs * opt.comm_round + + lf = lambda x: ((1 + math.cos(x * math.pi / total_epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf'] # cosine + scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) + + + + opt.scheduler = scheduler + opt.optimizer = optimizer + opt.ema = ema + + opt.hyp = hyp # add hyperparameters + + opt.wandb = wandb + device = init_training_device(process_id, worker_number - 1, opt.gpu_num_per_server) + # start "federated averaging (FedAvg)" + print("start distributed") + + try: + # start "federated averaging (FedAvg)" + print("start distributed") + FedML_FedAvg_distributed(process_id, worker_number, device, comm, + model, train_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, opt, None, True, hyp) + except Exception as e: + print(e) + logging.info('traceback.format_exc():\n%s' % traceback.format_exc()) + MPI.COMM_WORLD.Abort() + + diff --git a/experiments/distributed/Detection/models/experimental.py b/experiments/distributed/Detection/models/experimental.py new file mode 100644 index 0000000..0835ba9 --- /dev/null +++ b/experiments/distributed/Detection/models/experimental.py @@ -0,0 +1,152 @@ +# This file contains experimental modules + +import numpy as np +import torch +import torch.nn as nn + +from .common import Conv, DWConv +from utils.google_utils import attempt_download + + +class CrossConv(nn.Module): + # Cross Convolution Downsample + def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False): + # ch_in, ch_out, kernel, stride, groups, expansion, shortcut + super(CrossConv, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, (1, k), (1, s)) + self.cv2 = Conv(c_, c2, (k, 1), (s, 1), g=g) + self.add = shortcut and c1 == c2 + + def 
forward(self, x): + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class C3(nn.Module): + # Cross Convolution CSP + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion + super(C3, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) + self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) + self.cv4 = Conv(2 * c_, c2, 1, 1) + self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) + self.act = nn.LeakyReLU(0.1, inplace=True) + self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)]) + + def forward(self, x): + y1 = self.cv3(self.m(self.cv1(x))) + y2 = self.cv2(x) + return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) + + +class Sum(nn.Module): + # Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070 + def __init__(self, n, weight=False): # n: number of inputs + super(Sum, self).__init__() + self.weight = weight # apply weights boolean + self.iter = range(n - 1) # iter object + if weight: + self.w = nn.Parameter(-torch.arange(1., n) / 2, requires_grad=True) # layer weights + + def forward(self, x): + y = x[0] # no weight + if self.weight: + w = torch.sigmoid(self.w) * 2 + for i in self.iter: + y = y + x[i + 1] * w[i] + else: + for i in self.iter: + y = y + x[i + 1] + return y + + +class GhostConv(nn.Module): + # Ghost Convolution https://github.com/huawei-noah/ghostnet + def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups + super(GhostConv, self).__init__() + c_ = c2 // 2 # hidden channels + self.cv1 = Conv(c1, c_, k, s, None, g, act) + self.cv2 = Conv(c_, c_, 5, 1, None, c_, act) + + def forward(self, x): + y = self.cv1(x) + return torch.cat([y, self.cv2(y)], 1) + + +class GhostBottleneck(nn.Module): + # Ghost Bottleneck https://github.com/huawei-noah/ghostnet + def __init__(self, c1, c2, k, s): + 
super(GhostBottleneck, self).__init__() + c_ = c2 // 2 + self.conv = nn.Sequential(GhostConv(c1, c_, 1, 1), # pw + DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw + GhostConv(c_, c2, 1, 1, act=False)) # pw-linear + self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), + Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity() + + def forward(self, x): + return self.conv(x) + self.shortcut(x) + + +class MixConv2d(nn.Module): + # Mixed Depthwise Conv https://arxiv.org/abs/1907.09595 + def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): + super(MixConv2d, self).__init__() + groups = len(k) + if equal_ch: # equal c_ per group + i = torch.linspace(0, groups - 1E-6, c2).floor() # c2 indices + c_ = [(i == g).sum() for g in range(groups)] # intermediate channels + else: # equal weight.numel() per group + b = [c2] + [0] * groups + a = np.eye(groups + 1, groups, k=-1) + a -= np.roll(a, 1, axis=1) + a *= np.array(k) ** 2 + a[0] = 1 + c_ = np.linalg.lstsq(a, b, rcond=None)[0].round() # solve for equal weight indices, ax = b + + self.m = nn.ModuleList([nn.Conv2d(c1, int(c_[g]), k[g], s, k[g] // 2, bias=False) for g in range(groups)]) + self.bn = nn.BatchNorm2d(c2) + self.act = nn.LeakyReLU(0.1, inplace=True) + + def forward(self, x): + return x + self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) + + +class Ensemble(nn.ModuleList): + # Ensemble of models + def __init__(self): + super(Ensemble, self).__init__() + + def forward(self, x, augment=False): + y = [] + for module in self: + y.append(module(x, augment)[0]) + # y = torch.stack(y).max(0)[0] # max ensemble + # y = torch.cat(y, 1) # nms ensemble + y = torch.stack(y).mean(0) # mean ensemble + return y, None # inference, train output + + +def attempt_load(weights, map_location=None): + # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a + model = Ensemble() + for w in weights if isinstance(weights, list) else [weights]: + attempt_download(w) + 
model.append(torch.load(w, map_location=map_location)['model'].float().fuse().eval()) # load FP32 model + + # Compatibility updates + for m in model.modules(): + if type(m) in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]: + m.inplace = True # pytorch 1.7.0 compatibility + elif type(m) is Conv: + m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatibility + + if len(model) == 1: + return model[-1] # return model + else: + print('Ensemble created with %s\n' % weights) + for k in ['names', 'stride']: + setattr(model, k, getattr(model[-1], k)) + return model # return ensemble diff --git a/experiments/distributed/Detection/models/yolov5l.yaml b/experiments/distributed/Detection/models/yolov5l.yaml new file mode 100644 index 0000000..1309554 --- /dev/null +++ b/experiments/distributed/Detection/models/yolov5l.yaml @@ -0,0 +1,48 @@ +# parameters +nc: 80 # number of classes +depth_multiple: 1.0 # model depth multiple +width_multiple: 1.0 # layer channel multiple + +# anchors +anchors: + - [10,13, 16,30, 33,23] # P3/8 + - [30,61, 62,45, 59,119] # P4/16 + - [116,90, 156,198, 373,326] # P5/32 + +# YOLOv5 backbone +backbone: + # [from, number, module, args] + [[-1, 1, Focus, [64, 3]], # 0-P1/2 + [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 + [-1, 3, BottleneckCSP, [128]], + [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 + [-1, 9, BottleneckCSP, [256]], + [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 + [-1, 9, BottleneckCSP, [512]], + [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 + [-1, 1, SPP, [1024, [5, 9, 13]]], + [-1, 3, BottleneckCSP, [1024, False]], # 9 + ] + +# YOLOv5 head +head: + [[-1, 1, Conv, [512, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 6], 1, Concat, [1]], # cat backbone P4 + [-1, 3, BottleneckCSP, [512, False]], # 13 + + [-1, 1, Conv, [256, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 4], 1, Concat, [1]], # cat backbone P3 + [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) + + [-1, 1, Conv, [256, 3, 2]], + [[-1, 14], 1, Concat, [1]], # cat 
head P4 + [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) + + [-1, 1, Conv, [512, 3, 2]], + [[-1, 10], 1, Concat, [1]], # cat head P5 + [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) + + [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) + ] diff --git a/experiments/distributed/Detection/models/yolov5m.yaml b/experiments/distributed/Detection/models/yolov5m.yaml new file mode 100644 index 0000000..eb50a71 --- /dev/null +++ b/experiments/distributed/Detection/models/yolov5m.yaml @@ -0,0 +1,48 @@ +# parameters +nc: 80 # number of classes +depth_multiple: 0.67 # model depth multiple +width_multiple: 0.75 # layer channel multiple + +# anchors +anchors: + - [10,13, 16,30, 33,23] # P3/8 + - [30,61, 62,45, 59,119] # P4/16 + - [116,90, 156,198, 373,326] # P5/32 + +# YOLOv5 backbone +backbone: + # [from, number, module, args] + [[-1, 1, Focus, [64, 3]], # 0-P1/2 + [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 + [-1, 3, BottleneckCSP, [128]], + [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 + [-1, 9, BottleneckCSP, [256]], + [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 + [-1, 9, BottleneckCSP, [512]], + [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 + [-1, 1, SPP, [1024, [5, 9, 13]]], + [-1, 3, BottleneckCSP, [1024, False]], # 9 + ] + +# YOLOv5 head +head: + [[-1, 1, Conv, [512, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 6], 1, Concat, [1]], # cat backbone P4 + [-1, 3, BottleneckCSP, [512, False]], # 13 + + [-1, 1, Conv, [256, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 4], 1, Concat, [1]], # cat backbone P3 + [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) + + [-1, 1, Conv, [256, 3, 2]], + [[-1, 14], 1, Concat, [1]], # cat head P4 + [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) + + [-1, 1, Conv, [512, 3, 2]], + [[-1, 10], 1, Concat, [1]], # cat head P5 + [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) + + [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) + ] diff --git 
a/experiments/distributed/Detection/models/yolov5s.yaml b/experiments/distributed/Detection/models/yolov5s.yaml new file mode 100644 index 0000000..2bec452 --- /dev/null +++ b/experiments/distributed/Detection/models/yolov5s.yaml @@ -0,0 +1,48 @@ +# parameters +nc: 80 # number of classes +depth_multiple: 0.33 # model depth multiple +width_multiple: 0.50 # layer channel multiple + +# anchors +anchors: + - [10,13, 16,30, 33,23] # P3/8 + - [30,61, 62,45, 59,119] # P4/16 + - [116,90, 156,198, 373,326] # P5/32 + +# YOLOv5 backbone +backbone: + # [from, number, module, args] + [[-1, 1, Focus, [64, 3]], # 0-P1/2 + [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 + [-1, 3, BottleneckCSP, [128]], + [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 + [-1, 9, BottleneckCSP, [256]], + [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 + [-1, 9, BottleneckCSP, [512]], + [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 + [-1, 1, SPP, [1024, [5, 9, 13]]], + [-1, 3, BottleneckCSP, [1024, False]], # 9 + ] + +# YOLOv5 head +head: + [[-1, 1, Conv, [512, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 6], 1, Concat, [1]], # cat backbone P4 + [-1, 3, BottleneckCSP, [512, False]], # 13 + + [-1, 1, Conv, [256, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 4], 1, Concat, [1]], # cat backbone P3 + [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) + + [-1, 1, Conv, [256, 3, 2]], + [[-1, 14], 1, Concat, [1]], # cat head P4 + [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) + + [-1, 1, Conv, [512, 3, 2]], + [[-1, 10], 1, Concat, [1]], # cat head P5 + [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) + + [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) + ] diff --git a/experiments/distributed/Detection/run_fedavg_distributed_pytorch.sh b/experiments/distributed/Detection/run_fedavg_distributed_pytorch.sh new file mode 100644 index 0000000..6a1b944 --- /dev/null +++ b/experiments/distributed/Detection/run_fedavg_distributed_pytorch.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + 
+CLIENT_NUM=$1 +WORKER_NUM=$2 +SERVER_NUM=$3 +GPU_NUM_PER_SERVER=$4 +DATA=$5 +DISTRIBUTION=$6 +ROUND=$7 +EPOCH=$8 +BATCH_SIZE=$9 +LR=${10} +DATASET=${11} +DATA_DIR=${12} +WEIGHTS=${13} +CI=${14} +DEVICE=${15} + +PROCESS_NUM=`expr $WORKER_NUM + 1` +#echo $PROCESS_NUM +echo $DATA +echo $DATASET +echo $DATA_DIR +export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' + +hostname > mpi_host_file + +mpirun -np $PROCESS_NUM -hostfile ./mpi_host_file python ./main_fedavg_yolo.py \ + --gpu_server_num $SERVER_NUM \ + --gpu_num_per_server $GPU_NUM_PER_SERVER \ + --data $DATA \ + --cfg $DATASET \ + --device $DATA_DIR \ + --partition_method $DISTRIBUTION \ + --client_num_in_total $CLIENT_NUM \ + --client_num_per_round $WORKER_NUM \ + --comm_round $ROUND \ + --epochs $EPOCH \ + --weights $WEIGHTS \ + --batch_size $BATCH_SIZE \ + --lr $LR \ + --ci $CI \ +# --notest \ + --device $DEVICE diff --git a/experiments/distributed/Detection/test.py b/experiments/distributed/Detection/test.py new file mode 100644 index 0000000..7626c07 --- /dev/null +++ b/experiments/distributed/Detection/test.py @@ -0,0 +1,346 @@ +import argparse +import json +import os +from pathlib import Path +from threading import Thread + +import numpy as np +import torch +import yaml +from tqdm import tqdm + +import sys + +sys.path.append("../../../") +# sys.path.append('/home/weiyaowu/Documents/project_doing/fedml/FedML-master') + +from fedml_api.model.object_detection.yolov5.models.experimental import attempt_load +from fedml_api.model.object_detection.yolov5.utils.datasets import create_dataloader +from fedml_api.model.object_detection.yolov5.utils.general import coco80_to_coco91_class, check_dataset, check_file, check_img_size, box_iou, \ + non_max_suppression, scale_coords, xyxy2xywh, xywh2xyxy, set_logging, increment_path +from fedml_api.model.object_detection.yolov5.utils.loss import compute_loss +from fedml_api.model.object_detection.yolov5.utils.metrics import ap_per_class, ConfusionMatrix +from 
fedml_api.model.object_detection.yolov5.utils.plots import plot_images, output_to_target, plot_study_txt +from fedml_api.model.object_detection.yolov5.utils.torch_utils import select_device, time_synchronized + + +def test(data, + weights=None, + batch_size=16, + imgsz=640, + conf_thres=0.001, + iou_thres=0.6, # for NMS + save_json=False, + single_cls=False, + augment=False, + verbose=False, + model=None, + dataloader=None, + save_dir=Path(''), # for saving images + save_txt=False, # for auto-labelling + save_hybrid=False, # for hybrid auto-labelling + save_conf=False, # save auto-label confidences + plots=True, + log_imgs=0): # number of logged images + + # Initialize/load model and set device + training = model is not None + if training: # called by train.py + device = next(model.parameters()).device # get model device + + else: # called directly + set_logging() + device = select_device(opt.device, batch_size=batch_size) + + # device = 'cpu' + # Directories + save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # increment run + (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True) # make dir + + # Load model + model = attempt_load(weights, map_location=device) # load FP32 model + imgsz = check_img_size(imgsz, s=model.stride.max()) # check img_size + + # Multi-GPU disabled, incompatible with .half() https://github.com/ultralytics/yolov5/issues/99 + # if device.type != 'cpu' and torch.cuda.device_count() > 1: + # model = nn.DataParallel(model) + + device = torch.device('cpu') + # Half + half = False + # half = device.type != 'cpu' # half precision only supported on CUDA + if False and half: + model.half() + + # Configure + model = model.to(device) + model.eval() + is_coco = data.endswith('coco.yaml') # is COCO dataset + with open(data) as f: + data = yaml.load(f, Loader=yaml.FullLoader) # model dict + check_dataset(data) # check + nc = 1 if single_cls else int(data['nc']) # number of classes + iouv = 
torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95 + niou = iouv.numel() + + # Logging + log_imgs, wandb = min(log_imgs, 100), None # ceil + try: + import wandb # Weights & Biases + except ImportError: + log_imgs = 0 + + # Dataloader + if not training: + img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img + _ = model(img.half() if half else img) if device.type != 'cpu' else None # run once + path = data['test'] if opt.task == 'test' else data['val'] # path to val/test images + dataloader = create_dataloader(path, imgsz, batch_size, model.stride.max(), opt, pad=0.5, rect=True)[0] + + seen = 0 + confusion_matrix = ConfusionMatrix(nc=nc) + names = {k: v for k, v in enumerate(model.names if hasattr(model, 'names') else model.module.names)} + coco91class = coco80_to_coco91_class() + s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95') + p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0. + loss = torch.zeros(3, device=device) + jdict, stats, ap, ap_class, wandb_images = [], [], [], [], [] + for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): + img = img.to(device, non_blocking=True) + img = img.half() if half else img.float() # uint8 to fp16/32 + img /= 255.0 # 0 - 255 to 0.0 - 1.0 + targets = targets.to(device) + nb, _, height, width = img.shape # batch size, channels, height, width + + with torch.no_grad(): + # Run model + t = time_synchronized() + inf_out, train_out = model(img, augment=augment) # inference and training outputs + t0 += time_synchronized() - t + + # Compute loss + if training: + loss += compute_loss([x.float() for x in train_out], targets, model)[1][:3] # box, obj, cls + + # Run NMS + targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device) # to pixels + lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else [] # for autolabelling + t = time_synchronized() + inf_out = inf_out.cpu() + 
output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres, labels=lb) + t1 += time_synchronized() - t + + # Statistics per image + targets = targets.cpu() + for si, pred in enumerate(output): + labels = targets[targets[:, 0] == si, 1:] + nl = len(labels) + tcls = labels[:, 0].tolist() if nl else [] # target class + path = Path(paths[si]) + seen += 1 + + if len(pred) == 0: + if nl: + stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls)) + continue + + # Predictions + predn = pred.clone() + scale_coords(img[si].shape[1:], predn[:, :4], shapes[si][0], shapes[si][1]) # native-space pred + + # Append to text file + if save_txt: + gn = torch.tensor(shapes[si][0])[[1, 0, 1, 0]] # normalization gain whwh + for *xyxy, conf, cls in predn.tolist(): + xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh + line = (cls, *xywh, conf) if save_conf else (cls, *xywh) # label format + with open(save_dir / 'labels' / (path.stem + '.txt'), 'a') as f: + f.write(('%g ' * len(line)).rstrip() % line + '\n') + + # W&B logging + if plots and len(wandb_images) < log_imgs: + box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]}, + "class_id": int(cls), + "box_caption": "%s %.3f" % (names[cls], conf), + "scores": {"class_score": conf}, + "domain": "pixel"} for *xyxy, conf, cls in pred.tolist()] + boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space + wandb_images.append(wandb.Image(img[si], boxes=boxes, caption=path.name)) + + # Append to pycocotools JSON dictionary + if save_json: + # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ... 
+ image_id = int(path.stem) if path.stem.isnumeric() else path.stem + box = xyxy2xywh(predn[:, :4]) # xywh + box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner + for p, b in zip(pred.tolist(), box.tolist()): + jdict.append({'image_id': image_id, + 'category_id': coco91class[int(p[5])] if is_coco else int(p[5]), + 'bbox': [round(x, 3) for x in b], + 'score': round(p[4], 5)}) + + # Assign all predictions as incorrect + correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device) + if nl: + detected = [] # target indices + tcls_tensor = labels[:, 0] + + # target boxes + tbox = xywh2xyxy(labels[:, 1:5]) + scale_coords(img[si].shape[1:], tbox, shapes[si][0], shapes[si][1]) # native-space labels + if plots: + confusion_matrix.process_batch(pred, torch.cat((labels[:, 0:1], tbox), 1)) + + # Per target class + for cls in torch.unique(tcls_tensor): + ti = (cls == tcls_tensor).nonzero(as_tuple=False).view(-1) # prediction indices + pi = (cls == pred[:, 5]).nonzero(as_tuple=False).view(-1) # target indices + + # Search for detections + if pi.shape[0]: + # Prediction to target ious + ious, i = box_iou(predn[pi, :4], tbox[ti]).max(1) # best ious, indices + + # Append detections + detected_set = set() + for j in (ious > iouv[0]).nonzero(as_tuple=False): + d = ti[i[j]] # detected target + if d.item() not in detected_set: + detected_set.add(d.item()) + detected.append(d) + correct[pi[j]] = ious[j] > iouv # iou_thres is 1xn + if len(detected) == nl: # all targets already located in image + break + + # Append statistics (correct, conf, pcls, tcls) + stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) + + # Plot images + if plots and batch_i < 3: + f = save_dir / f'test_batch{batch_i}_labels.jpg' # labels + Thread(target=plot_images, args=(img, targets, paths, f, names), daemon=True).start() + f = save_dir / f'test_batch{batch_i}_pred.jpg' # predictions + Thread(target=plot_images, args=(img, output_to_target(output), paths, f, names), 
daemon=True).start() + + # Compute statistics + stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy + if len(stats) and stats[0].any(): + p, r, ap, f1, ap_class = ap_per_class(*stats, plot=plots, save_dir=save_dir, names=names) + p, r, ap50, ap = p[:, 0], r[:, 0], ap[:, 0], ap.mean(1) # [P, R, AP@0.5, AP@0.5:0.95] + mp, mr, map50, map = p.mean(), r.mean(), ap50.mean(), ap.mean() + nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class + else: + nt = torch.zeros(1) + + # Print results + pf = '%20s' + '%12.3g' * 6 # print format + print(pf % ('all', seen, nt.sum(), mp, mr, map50, map)) + + # Print results per class + if verbose and nc > 1 and len(stats): + for i, c in enumerate(ap_class): + print(pf % (names[c], seen, nt[c], p[i], r[i], ap50[i], ap[i])) + + # Print speeds + t = tuple(x / seen * 1E3 for x in (t0, t1, t0 + t1)) + (imgsz, imgsz, batch_size) # tuple + if not training: + print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t) + + # Plots + if plots: + confusion_matrix.plot(save_dir=save_dir, names=list(names.values())) + if wandb and wandb.run: + wandb.log({"Images": wandb_images}) + wandb.log({"Validation": [wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('test*.jpg'))]}) + + # Save JSON + if save_json and len(jdict): + w = Path(weights[0] if isinstance(weights, list) else weights).stem if weights is not None else '' # weights + anno_json = '../coco/annotations/instances_val2017.json' # annotations json + pred_json = str(save_dir / f"{w}_predictions.json") # predictions json + print('\nEvaluating pycocotools mAP... saving %s...' 
% pred_json) + with open(pred_json, 'w') as f: + json.dump(jdict, f) + + try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + anno = COCO(anno_json) # init annotations api + pred = anno.loadRes(pred_json) # init predictions api + eval = COCOeval(anno, pred, 'bbox') + if is_coco: + eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files] # image IDs to evaluate + eval.evaluate() + eval.accumulate() + eval.summarize() + map, map50 = eval.stats[:2] # update results (mAP@0.5:0.95, mAP@0.5) + except Exception as e: + print(f'pycocotools unable to run: {e}') + + # Return results + if not training: + s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else '' + print(f"Results saved to {save_dir}{s}") + model.float() # for training + model.cuda() + maps = np.zeros(nc) + map + for i, c in enumerate(ap_class): + maps[c] = ap[i] + return (mp, mr, map50, map, *(loss.cpu() / len(dataloader)).tolist()), maps, t + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog='test.py') + parser.add_argument('--weights', nargs='+', type=str, default='yolov5s.pt', help='model.pt path(s)') + parser.add_argument('--data', type=str, default='data/coco128.yaml', help='*.data path') + parser.add_argument('--batch-size', type=int, default=32, help='size of each image batch') + parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--task', default='val', help="'val', 'test', 'study'") + parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') + parser.add_argument('--single-cls', action='store_true', help='treat as single-class dataset') + parser.add_argument('--augment', action='store_true', help='augmented inference') + parser.add_argument('--verbose', action='store_true', help='report mAP by class') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--save-hybrid', action='store_true', help='save label+prediction hybrid results to *.txt') + parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels') + parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') + parser.add_argument('--project', default='runs/test', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') + opt = parser.parse_args() + opt.save_json |= opt.data.endswith('coco.yaml') + opt.data = check_file(opt.data) # check file + print(opt) + # opt.device = 'cpu' + if opt.task in ['val', 'test']: # run normally + test(opt.data, + opt.weights, + opt.batch_size, + opt.img_size, + opt.conf_thres, + opt.iou_thres, + opt.save_json, + opt.single_cls, + opt.augment, + opt.verbose, + save_txt=opt.save_txt | opt.save_hybrid, + save_hybrid=opt.save_hybrid, + save_conf=opt.save_conf, + ) + + elif opt.task == 'study': # run over a range of settings and save/plot + for weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']: + f = 'study_%s_%s.txt' % (Path(opt.data).stem, Path(weights).stem) # filename to save to + x = list(range(320, 800, 64)) # x axis + y = [] # y axis + for i in x: # img-size + print('\nRunning %s point %s...' 
% (f, i)) + r, _, t = test(opt.data, weights, opt.batch_size, i, opt.conf_thres, opt.iou_thres, opt.save_json, + plots=False) + y.append(r + t) # results and times + np.savetxt(f, y, fmt='%10.4g') # save + os.system('zip -r study.zip study_*.txt') + plot_study_txt(f, x) # plot diff --git a/experiments/distributed/Detection/utils/activations.py b/experiments/distributed/Detection/utils/activations.py new file mode 100644 index 0000000..24f5a30 --- /dev/null +++ b/experiments/distributed/Detection/utils/activations.py @@ -0,0 +1,72 @@ +# Activation functions + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# Swish https://arxiv.org/pdf/1905.02244.pdf --------------------------------------------------------------------------- +class Swish(nn.Module): # + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +class Hardswish(nn.Module): # export-friendly version of nn.Hardswish() + @staticmethod + def forward(x): + # return x * F.hardsigmoid(x) # for torchscript and CoreML + return x * F.hardtanh(x + 3, 0., 6.) / 6. 
# for torchscript, CoreML and ONNX + + +class MemoryEfficientSwish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x * torch.sigmoid(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + return grad_output * (sx * (1 + x * (1 - sx))) + + def forward(self, x): + return self.F.apply(x) + + +# Mish https://github.com/digantamisra98/Mish -------------------------------------------------------------------------- +class Mish(nn.Module): + @staticmethod + def forward(x): + return x * F.softplus(x).tanh() + + +class MemoryEfficientMish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + fx = F.softplus(x).tanh() + return grad_output * (fx + x * sx * (1 - fx * fx)) + + def forward(self, x): + return self.F.apply(x) + + +# FReLU https://arxiv.org/abs/2007.11824 ------------------------------------------------------------------------------- +class FReLU(nn.Module): + def __init__(self, c1, k=3): # ch_in, kernel + super().__init__() + self.conv = nn.Conv2d(c1, c1, k, 1, 1, groups=c1, bias=False) + self.bn = nn.BatchNorm2d(c1) + + def forward(self, x): + return torch.max(x, self.bn(self.conv(x))) diff --git a/experiments/distributed/Detection/utils/datasets.py b/experiments/distributed/Detection/utils/datasets.py new file mode 100644 index 0000000..313180f --- /dev/null +++ b/experiments/distributed/Detection/utils/datasets.py @@ -0,0 +1,933 @@ +# Dataset utils and dataloaders + +import glob +import logging +import math +import os +import random +import shutil +import time +from itertools import repeat +from multiprocessing.pool import ThreadPool +from pathlib import Path +from threading import Thread + +import cv2 
+import numpy as np +import torch +from PIL import Image, ExifTags +from torch.utils.data import Dataset +from tqdm import tqdm + +from .general import xyxy2xywh, xywh2xyxy +from .torch_utils import torch_distributed_zero_first + +# Parameters +help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data' +img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng'] # acceptable image suffixes +vid_formats = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes +logger = logging.getLogger(__name__) + +# Get orientation exif tag +for orientation in ExifTags.TAGS.keys(): + if ExifTags.TAGS[orientation] == 'Orientation': + break + + +def get_hash(files): + # Returns a single hash value of a list of files + return sum(os.path.getsize(f) for f in files if os.path.isfile(f)) + + +def exif_size(img): + # Returns exif-corrected PIL size + s = img.size # (width, height) + try: + rotation = dict(img._getexif().items())[orientation] + if rotation == 6: # rotation 270 + s = (s[1], s[0]) + elif rotation == 8: # rotation 90 + s = (s[1], s[0]) + except: + pass + + return s + + +def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False, + rank=-1, world_size=1, workers=8, image_weights=False): + # Make sure only the first process in DDP process the dataset first, and the following others can use the cache + with torch_distributed_zero_first(rank): + dataset = LoadImagesAndLabels(path, imgsz, batch_size, + augment=augment, # augment images + hyp=hyp, # augmentation hyperparameters + rect=rect, # rectangular training + cache_images=cache, + single_cls=opt.single_cls, + stride=int(stride), + pad=pad, + rank=rank, + image_weights=image_weights) + + batch_size = min(batch_size, len(dataset)) + nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, workers]) # number of workers + sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != 
-1 else None + loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader + # Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader() + dataloader = loader(dataset, + batch_size=batch_size, + num_workers=nw, + sampler=sampler, + pin_memory=True, + collate_fn=LoadImagesAndLabels.collate_fn) + return dataloader, dataset + + +class InfiniteDataLoader(torch.utils.data.dataloader.DataLoader): + """ Dataloader that reuses workers + + Uses same syntax as vanilla DataLoader + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler)) + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever + + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) + + +class LoadImages: # for inference + def __init__(self, path, img_size=640): + p = str(Path(path)) # os-agnostic + p = os.path.abspath(p) # absolute path + if '*' in p: + files = sorted(glob.glob(p, recursive=True)) # glob + elif os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '*.*'))) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise Exception('ERROR: %s does not exist' % p) + + images = [x for x in files if x.split('.')[-1].lower() in img_formats] + videos = [x for x in files if x.split('.')[-1].lower() in vid_formats] + ni, nv = len(images), len(videos) + + self.img_size = img_size + self.files = images + videos + self.nf = ni + nv # number of files + self.video_flag = [False] * ni + [True] * nv + self.mode = 'image' + if any(videos): + self.new_video(videos[0]) # new video + else: + self.cap = None + assert self.nf > 0, 'No 
images or videos found in %s. Supported formats are:\nimages: %s\nvideos: %s' % \ + (p, img_formats, vid_formats) + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + + if self.video_flag[self.count]: + # Read video + self.mode = 'video' + ret_val, img0 = self.cap.read() + if not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + else: + path = self.files[self.count] + self.new_video(path) + ret_val, img0 = self.cap.read() + + self.frame += 1 + print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nf, self.frame, self.nframes, path), end='') + + else: + # Read image + self.count += 1 + img0 = cv2.imread(path) # BGR + assert img0 is not None, 'Image Not Found ' + path + print('image %g/%g %s: ' % (self.count, self.nf, path), end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return path, img, img0, self.cap + + def new_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files + + +class LoadWebcam: # for inference + def __init__(self, pipe='0', img_size=640): + self.img_size = img_size + + if pipe.isnumeric(): + pipe = eval(pipe) # local camera + # pipe = 'rtsp://192.168.1.64/1' # IP camera + # pipe = 'rtsp://username:password@192.168.1.64/1' # IP camera with login + # pipe = 'http://wmccpinetop.axiscam.net/mjpg/video.mjpg' # IP golf camera + + self.pipe = pipe + self.cap = cv2.VideoCapture(pipe) # video capture object + self.cap.set(cv2.CAP_PROP_BUFFERSIZE, 3) # set buffer size + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if cv2.waitKey(1) == ord('q'): # q to 
quit + self.cap.release() + cv2.destroyAllWindows() + raise StopIteration + + # Read frame + if self.pipe == 0: # local camera + ret_val, img0 = self.cap.read() + img0 = cv2.flip(img0, 1) # flip left-right + else: # IP camera + n = 0 + while True: + n += 1 + self.cap.grab() + if n % 30 == 0: # skip frames + ret_val, img0 = self.cap.retrieve() + if ret_val: + break + + # Print + assert ret_val, 'Camera Error %s' % self.pipe + img_path = 'webcam.jpg' + print('webcam %g: ' % self.count, end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return img_path, img, img0, None + + def __len__(self): + return 0 + + +class LoadStreams: # multiple IP or RTSP cameras + def __init__(self, sources='streams.txt', img_size=640): + self.mode = 'stream' + self.img_size = img_size + + if os.path.isfile(sources): + with open(sources, 'r') as f: + sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())] + else: + sources = [sources] + + n = len(sources) + self.imgs = [None] * n + self.sources = sources + for i, s in enumerate(sources): + # Start the thread to read frames from the video stream + print('%g/%g: %s... ' % (i + 1, n, s), end='') + cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s) + assert cap.isOpened(), 'Failed to open %s' % s + w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) % 100 + _, self.imgs[i] = cap.read() # guarantee first frame + thread = Thread(target=self.update, args=([i, cap]), daemon=True) + print(' success (%gx%g at %.2f FPS).' 
% (w, h, fps)) + thread.start() + print('') # newline + + # check for common shapes + s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes + self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal + if not self.rect: + print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.') + + def update(self, index, cap): + # Read next stream frame in a daemon thread + n = 0 + while cap.isOpened(): + n += 1 + # _, self.imgs[index] = cap.read() + cap.grab() + if n == 4: # read every 4th frame + _, self.imgs[index] = cap.retrieve() + n = 0 + time.sleep(0.01) # wait time + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + img0 = self.imgs.copy() + if cv2.waitKey(1) == ord('q'): # q to quit + cv2.destroyAllWindows() + raise StopIteration + + # Letterbox + img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0] + + # Stack + img = np.stack(img, 0) + + # Convert + img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416 + img = np.ascontiguousarray(img) + + return self.sources, img, img0, None + + def __len__(self): + return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years + + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep # /images/, /labels/ substrings + return [x.replace(sa, sb, 1).replace('.' 
+ x.split('.')[-1], '.txt') for x in img_paths] + + +class LoadImagesAndLabels(Dataset): # for training/testing + def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False, + cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1): + self.img_size = img_size + self.augment = augment + self.hyp = hyp + self.image_weights = image_weights + self.rect = False if image_weights else rect + self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training) + self.mosaic_border = [-img_size // 2, -img_size // 2] + self.stride = stride + + try: + f = [] # image files + for p in path if isinstance(path, list) else [path]: + p = Path(p) # os-agnostic + if p.is_dir(): # dir + f += glob.glob(str(p / '**' / '*.*'), recursive=True) + elif p.is_file(): # file + with open(p, 'r') as t: + t = t.read().strip().splitlines() + parent = str(p.parent) + os.sep + f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + else: + raise Exception('%s does not exist' % p) + self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats]) + assert self.img_files, 'No images found' + except Exception as e: + raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) + + # Check cache + self.label_files = img2label_paths(self.img_files) # labels + cache_path = Path(self.label_files[0]).parent.with_suffix('.cache') # cached labels + if cache_path.is_file(): + cache = torch.load(cache_path) # load + if cache['hash'] != get_hash(self.label_files + self.img_files) or 'results' not in cache: # changed + cache = self.cache_labels(cache_path) # re-cache + else: + cache = self.cache_labels(cache_path) # cache + + # Display cache + [nf, nm, ne, nc, n] = cache.pop('results') # found, missing, empty, corrupted, total + desc = f"Scanning '{cache_path}' for images and labels... 
{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + tqdm(None, desc=desc, total=n, initial=n) + assert nf > 0 or not augment, f'No labels found in {cache_path}. Can not train without labels. See {help_url}' + + # Read cache + cache.pop('hash') # remove hash + labels, shapes = zip(*cache.values()) + self.labels = list(labels) + self.shapes = np.array(shapes, dtype=np.float64) + self.img_files = list(cache.keys()) # update + self.label_files = img2label_paths(cache.keys()) # update + if single_cls: + for x in self.labels: + x[:, 0] = 0 + + n = len(shapes) # number of images + bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index + nb = bi[-1] + 1 # number of batches + self.batch = bi # batch index of image + self.n = n + self.indices = range(n) + + # Rectangular Training + if self.rect: + # Sort by aspect ratio + s = self.shapes # wh + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_files = [self.img_files[i] for i in irect] + self.label_files = [self.label_files[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * nb + for i in range(nb): + ari = ar[bi == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [maxi, 1] + elif mini > 1: + shapes[i] = [1, 1 / mini] + + self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride + + # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM) + self.imgs = [None] * n + if cache_images: + gb = 0 # Gigabytes of cached images + self.img_hw0, self.img_hw = [None] * n, [None] * n + results = ThreadPool(8).imap(lambda x: load_image(*x), zip(repeat(self), range(n))) # 8 threads + pbar = tqdm(enumerate(results), total=n) + for i, x in pbar: + self.imgs[i], self.img_hw0[i], self.img_hw[i] = x # img, hw_original, hw_resized = load_image(self, i) + gb += self.imgs[i].nbytes + pbar.desc = 
'Caching images (%.1fGB)' % (gb / 1E9) + + def cache_labels(self, path=Path('./labels.cache')): + # Cache dataset labels, check images and read shapes + x = {} # dict + nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, duplicate + pbar = tqdm(zip(self.img_files, self.label_files), desc='Scanning images', total=len(self.img_files)) + for i, (im_file, lb_file) in enumerate(pbar): + try: + # verify images + im = Image.open(im_file) + im.verify() # PIL verify + shape = exif_size(im) # image size + assert (shape[0] > 9) & (shape[1] > 9), 'image size <10 pixels' + + # verify labels + if os.path.isfile(lb_file): + nf += 1 # label found + with open(lb_file, 'r') as f: + l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + if len(l): + assert l.shape[1] == 5, 'labels require 5 columns each' + assert (l >= 0).all(), 'negative labels' + assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels' + else: + ne += 1 # label empty + l = np.zeros((0, 5), dtype=np.float32) + else: + nm += 1 # label missing + l = np.zeros((0, 5), dtype=np.float32) + x[im_file] = [l, shape] + except Exception as e: + nc += 1 + print('WARNING: Ignoring corrupted image and/or label %s: %s' % (im_file, e)) + + pbar.desc = f"Scanning '{path.parent / path.stem}' for images and labels... " \ + f"{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + + if nf == 0: + print(f'WARNING: No labels found in {path}. 
See {help_url}') + + x['hash'] = get_hash(self.label_files + self.img_files) + x['results'] = [nf, nm, ne, nc, i + 1] + torch.save(x, path) # save for next time + logging.info(f"New cache created: {path}") + return x + + def __len__(self): + return len(self.img_files) + + # def __iter__(self): + # self.count = -1 + # print('ran dataset iter') + # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) + # return self + + def __getitem__(self, index): + index = self.indices[index] # linear, shuffled, or image_weights + + hyp = self.hyp + mosaic = self.mosaic and random.random() < hyp['mosaic'] + if mosaic: + # Load mosaic + img, labels = load_mosaic(self, index) + shapes = None + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if random.random() < hyp['mixup']: + img2, labels2 = load_mosaic(self, random.randint(0, self.n - 1)) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + + else: + # Load image + img, (h0, w0), (h, w) = load_image(self, index) + + # Letterbox + shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape + img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling + + # Load labels + labels = [] + x = self.labels[index] + if x.size > 0: + # Normalized xywh to pixel xyxy format + labels = x.copy() + labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0] # pad width + labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1] # pad height + labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0] + labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1] + + if self.augment: + # Augment imagespace + if not mosaic: + img, labels = random_perspective(img, labels, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + 
shear=hyp['shear'], + perspective=hyp['perspective']) + + # Augment colorspace + augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) + + # Apply cutouts + # if random.random() < 0.9: + # labels = cutout(img, labels) + + nL = len(labels) # number of labels + if nL: + labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) # convert xyxy to xywh + labels[:, [2, 4]] /= img.shape[0] # normalized height 0-1 + labels[:, [1, 3]] /= img.shape[1] # normalized width 0-1 + + if self.augment: + # flip up-down + if random.random() < hyp['flipud']: + img = np.flipud(img) + if nL: + labels[:, 2] = 1 - labels[:, 2] + + # flip left-right + if random.random() < hyp['fliplr']: + img = np.fliplr(img) + if nL: + labels[:, 1] = 1 - labels[:, 1] + + labels_out = torch.zeros((nL, 6)) + if nL: + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return torch.from_numpy(img), labels_out, self.img_files[index], shapes + + @staticmethod + def collate_fn(batch): + img, label, path, shapes = zip(*batch) # transposed + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes + + +# Ancillary functions -------------------------------------------------------------------------------------------------- +def load_image(self, index): + # loads 1 image from dataset, returns img, original hw, resized hw + img = self.imgs[index] + if img is None: # not cached + path = self.img_files[index] + img = cv2.imread(path) # BGR + assert img is not None, 'Image Not Found ' + path + h0, w0 = img.shape[:2] # orig hw + r = self.img_size / max(h0, w0) # resize image to img_size + if r != 1: # always resize down, only resize up if training with augmentation + interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR + img = cv2.resize(img, (int(w0 * r), int(h0 * r)), 
interpolation=interp) + return img, (h0, w0), img.shape[:2] # img, hw_original, hw_resized + else: + return self.imgs[index], self.img_hw0[index], self.img_hw[index] # img, hw_original, hw_resized + + +def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5): + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=np.int16) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + + # Histogram equalization + # if random.random() < 0.2: + # for i in range(3): + # img[:, :, i] = cv2.equalizeHist(img[:, :, i]) + + +def load_mosaic(self, index): + # loads images in a mosaic + + labels4 = [] + s = self.img_size + yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border] # mosaic center x, y + indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(3)] # 3 additional image indices + for i, index in enumerate(indices): + # Load image + img, _, (h, w) = load_image(self, index) + + # place img in img4 + if i == 0: # top left + img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) 
+ elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + x = self.labels[index] + labels = x.copy() + if x.size > 0: # Normalized xywh to pixel xyxy format + labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw + labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh + labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw + labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh + labels4.append(labels) + + # Concat/clip labels + if len(labels4): + labels4 = np.concatenate(labels4, 0) + np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:]) # use with random_perspective + # img4, labels4 = replicate(img4, labels4) # replicate + + # Augment + img4, labels4 = random_perspective(img4, labels4, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + perspective=self.hyp['perspective'], + border=self.mosaic_border) # border to remove + + return img4, labels4 + + +def replicate(img, labels): + # Replicate labels + h, w = img.shape[:2] + boxes = labels[:, 1:].astype(int) + x1, y1, x2, y2 = boxes.T + s = ((x2 - x1) + (y2 - y1)) / 2 # side length (pixels) + for i in s.argsort()[:round(s.size * 0.5)]: # smallest indices + x1b, y1b, x2b, y2b = boxes[i] + bh, bw = y2b - y1b, x2b - x1b + yc, xc = int(random.uniform(0, h - bh)), int(random.uniform(0, w - bw)) # offset x, y + x1a, y1a, x2a, y2a = [xc, yc, xc + bw, yc + bh] + img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + labels = np.append(labels, [[labels[i, 0], x1a, y1a, x2a, y2a]], axis=0) + + return img, labels + + +def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True): + # Resize image to a 32-pixel-multiple rectangle 
https://github.com/ultralytics/yolov3/issues/232 + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return img, ratio, (dw, dh) + + +def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # targets = [cls, xyxy] + + height = img.shape[0] + border[0] * 2 # shape(h,w,c) + width = img.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = -img.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3) + P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) + P[2, 1] = random.uniform(-perspective, perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3) + a = 
random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if perspective: + img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + # Visualize + # import matplotlib.pyplot as plt + # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() + # ax[0].imshow(img[:, :, ::-1]) # base + # ax[1].imshow(img2[:, :, ::-1]) # warped + + # Transform label coordinates + n = len(targets) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + if perspective: + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + else: # affine + xy = xy[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # # apply angle-based reduction of bounding boxes + # radians = a * math.pi / 180 + # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + # x = (xy[:, 2] + xy[:, 0]) / 2 + # y = (xy[:, 3] 
+ xy[:, 1]) / 2 + # w = (xy[:, 2] - xy[:, 0]) * reduction + # h = (xy[:, 3] - xy[:, 1]) * reduction + # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + + # filter candidates + i = box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T) + targets = targets[i] + targets[:, 1:5] = xy[i] + + return img, targets + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1): # box1(4,n), box2(4,n) + # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates + + +def cutout(image, labels): + # Applies image cutout augmentation https://arxiv.org/abs/1708.04552 + h, w = image.shape[:2] + + def bbox_ioa(box1, box2): + # Returns the intersection over box2 area given box1, box2. box1 is 4, box2 is nx4. 
boxes are x1y1x2y2 + box2 = box2.transpose() + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + 1e-16 + + # Intersection over box2 area + return inter_area / box2_area + + # create random masks + scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16 # image size fraction + for s in scales: + mask_h = random.randint(1, int(h * s)) + mask_w = random.randint(1, int(w * s)) + + # box + xmin = max(0, random.randint(0, w) - mask_w // 2) + ymin = max(0, random.randint(0, h) - mask_h // 2) + xmax = min(w, xmin + mask_w) + ymax = min(h, ymin + mask_h) + + # apply random color mask + image[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)] + + # return unobscured labels + if len(labels) and s > 0.03: + box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32) + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + labels = labels[ioa < 0.60] # remove >60% obscured labels + + return labels + + +def create_folder(path='./new'): + # Create folder + if os.path.exists(path): + shutil.rmtree(path) # delete output folder + os.makedirs(path) # make new output folder + + +def flatten_recursive(path='../coco128'): + # Flatten a recursive directory by bringing all files to top level + new_path = Path(path + '_flat') + create_folder(new_path) + for file in tqdm(glob.glob(str(Path(path)) + '/**/*.*', recursive=True)): + shutil.copyfile(file, new_path / Path(file).name) + + +def extract_boxes(path='../coco128/'): # from utils.datasets import *; extract_boxes('../coco128') + # Convert detection dataset into classification dataset, with one directory per class + + path = Path(path) # 
images dir + shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing + files = list(path.rglob('*.*')) + n = len(files) # number of files + for im_file in tqdm(files, total=n): + if im_file.suffix[1:] in img_formats: + # image + im = cv2.imread(str(im_file))[..., ::-1] # BGR to RGB + h, w = im.shape[:2] + + # labels + lb_file = Path(img2label_paths([str(im_file)])[0]) + if Path(lb_file).exists(): + with open(lb_file, 'r') as f: + lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + + for j, x in enumerate(lb): + c = int(x[0]) # class + f = (path / 'classifier') / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg' # new filename + if not f.parent.is_dir(): + f.parent.mkdir(parents=True) + + b = x[1:] * [w, h, w, h] # box + # b[2:] = b[2:].max() # rectangle to square + b[2:] = b[2:] * 1.2 + 3 # pad + b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(np.int) + + b[[0, 2]] = np.clip(b[[0, 2]], 0, w) # clip boxes outside of image + b[[1, 3]] = np.clip(b[[1, 3]], 0, h) + assert cv2.imwrite(str(f), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {f}' + + +def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0)): # from utils.datasets import *; autosplit('../coco128') + """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files + # Arguments + path: Path to images directory + weights: Train, val, test weights (list) + """ + path = Path(path) # images dir + files = list(path.rglob('*.*')) + n = len(files) # number of files + indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split + txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt'] # 3 txt files + [(path / x).unlink() for x in txt if (path / x).exists()] # remove existing + for i, img in tqdm(zip(indices, files), total=n): + if img.suffix[1:] in img_formats: + with open(path / txt[i], 'a') as f: + f.write(str(img) + '\n') # add image to txt file diff --git 
a/experiments/distributed/Detection/utils/google_app_engine/Dockerfile b/experiments/distributed/Detection/utils/google_app_engine/Dockerfile new file mode 100644 index 0000000..0155618 --- /dev/null +++ b/experiments/distributed/Detection/utils/google_app_engine/Dockerfile @@ -0,0 +1,25 @@ +FROM gcr.io/google-appengine/python + +# Create a virtualenv for dependencies. This isolates these packages from +# system-level packages. +# Use -p python3 or -p python3.7 to select python version. Default is version 2. +RUN virtualenv /env -p python3 + +# Setting these environment variables are the same as running +# source /env/bin/activate. +ENV VIRTUAL_ENV /env +ENV PATH /env/bin:$PATH + +RUN apt-get update && apt-get install -y python-opencv + +# Copy the application's requirements.txt and run pip to install all +# dependencies into the virtualenv. +ADD requirements.txt /app/requirements.txt +RUN pip install -r /app/requirements.txt + +# Add the application source code. +ADD . /app + +# Run a WSGI server to serve the application. gunicorn must be declared as +# a dependency in requirements.txt. 
+CMD gunicorn -b :$PORT main:app diff --git a/experiments/distributed/Detection/utils/google_app_engine/additional_requirements.txt b/experiments/distributed/Detection/utils/google_app_engine/additional_requirements.txt new file mode 100644 index 0000000..5fcc305 --- /dev/null +++ b/experiments/distributed/Detection/utils/google_app_engine/additional_requirements.txt @@ -0,0 +1,4 @@ +# add these requirements in your app on top of the existing ones +pip==18.1 +Flask==1.0.2 +gunicorn==19.9.0 diff --git a/experiments/distributed/Detection/utils/google_app_engine/app.yaml b/experiments/distributed/Detection/utils/google_app_engine/app.yaml new file mode 100644 index 0000000..ac29d10 --- /dev/null +++ b/experiments/distributed/Detection/utils/google_app_engine/app.yaml @@ -0,0 +1,14 @@ +runtime: custom +env: flex + +service: yolov5app + +liveness_check: + initial_delay_sec: 600 + +manual_scaling: + instances: 1 +resources: + cpu: 1 + memory_gb: 4 + disk_size_gb: 20 \ No newline at end of file diff --git a/experiments/distributed/Detection/utils/metrics.py b/experiments/distributed/Detection/utils/metrics.py new file mode 100644 index 0000000..99d5bcf --- /dev/null +++ b/experiments/distributed/Detection/utils/metrics.py @@ -0,0 +1,200 @@ +# Model validation metrics + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from . import general + + +def fitness(x): + # Model fitness as a weighted combination of metrics + w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95] + return (x[:, :4] * w).sum(1) + + +def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='precision-recall_curve.png', names=[]): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (nparray, nx1 or nx10). + conf: Objectness value from 0-1 (nparray). + pred_cls: Predicted object classes (nparray). 
+ target_cls: True object classes (nparray). + plot: Plot precision-recall curve at mAP@0.5 + save_dir: Plot save directory + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(target_cls) + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + pr_score = 0.1 # score to evaluate P and R https://github.com/ultralytics/yolov3/issues/898 + s = [unique_classes.shape[0], tp.shape[1]] # number class, number iou thresholds (i.e. 10 for mAP0.5...0.95) + ap, p, r = np.zeros(s), np.zeros(s), np.zeros(s) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = (target_cls == c).sum() # number of labels + n_p = i.sum() # number of predictions + + if n_p == 0 or n_l == 0: + continue + else: + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + 1e-16) # recall curve + r[ci] = np.interp(-pr_score, -conf[i], recall[:, 0]) # r at pr_score, negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-pr_score, -conf[i], precision[:, 0]) # p at pr_score + + # AP from recall-precision curve + for j in range(tp.shape[1]): + ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + if plot and (j == 0): + py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 + + # Compute F1 score (harmonic mean of precision and recall) + f1 = 2 * p * r / (p + r + 1e-16) + + if plot: + plot_pr_curve(px, py, ap, save_dir, names) + + return p, r, ap, f1, unique_classes.astype('int32') + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves + # Arguments + recall: The recall curve (list) + precision: The precision curve (list) + # Returns + Average precision, 
Update the confusion matrix for one batch of detections and ground-truth labels.
+ Arguments: + detections (Array[N, 6]), x1, y1, x2, y2, conf, class + labels (Array[M, 5]), class, x1, y1, x2, y2 + Returns: + None, updates confusion matrix accordingly + """ + detections = detections[detections[:, 4] > self.conf] + gt_classes = labels[:, 0].int() + detection_classes = detections[:, 5].int() + iou = general.box_iou(labels[:, 1:], detections[:, :4]) + + x = torch.where(iou > self.iou_thres) + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + else: + matches = np.zeros((0, 3)) + + n = matches.shape[0] > 0 + m0, m1, _ = matches.transpose().astype(np.int16) + for i, gc in enumerate(gt_classes): + j = m0 == i + if n and sum(j) == 1: + self.matrix[gc, detection_classes[m1[j]]] += 1 # correct + else: + self.matrix[gc, self.nc] += 1 # background FP + + if n: + for i, dc in enumerate(detection_classes): + if not any(m1 == i): + self.matrix[self.nc, dc] += 1 # background FN + + def matrix(self): + return self.matrix + + def plot(self, save_dir='', names=()): + try: + import seaborn as sn + + array = self.matrix / (self.matrix.sum(0).reshape(1, self.nc + 1) + 1E-6) # normalize + array[array < 0.005] = np.nan # don't annotate (would appear as 0.00) + + fig = plt.figure(figsize=(12, 9), tight_layout=True) + sn.set(font_scale=1.0 if self.nc < 50 else 0.8) # for label size + labels = (0 < len(names) < 99) and len(names) == self.nc # apply names to ticklabels + sn.heatmap(array, annot=self.nc < 30, annot_kws={"size": 8}, cmap='Blues', fmt='.2f', square=True, + xticklabels=names + ['background FN'] if labels else "auto", + yticklabels=names + ['background FP'] if labels else "auto").set_facecolor((1, 1, 1)) + fig.axes[0].set_xlabel('True') + 
if 0 < len(names) < 21:  # show mAP in legend if < 21 classes
--git a/experiments/distributed/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md b/experiments/distributed/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md new file mode 100644 index 0000000..e3797fd --- /dev/null +++ b/experiments/distributed/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md @@ -0,0 +1,41 @@ +# ILSVRC2012-100 + + + + + + + + + + +# 10 clients +``` + +# DAAI +# srun -N2 -B 4-4:2-2 \ +# srun -w hkbugpusrv03 -n 21 -B 21:4 \ +salloc -w hkbugpusrv03 -n 21 --cpus-per-task=4 \ +mpiexec \ + ~/py36/bin/python ./main.py \ + --gpu_util_parse "hkbugpusrv03:6,5,5,5" \ + --client_num_per_round 20 --client_num_in_total 100 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .992 + +``` + + + + + + + diff --git a/experiments/distributed/classification/experiment_scripts/gld160k EfficientNet.md b/experiments/distributed/classification/experiment_scripts/gld160k EfficientNet.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/experiment_scripts/gld160k MobileNetV3.md b/experiments/distributed/classification/experiment_scripts/gld160k MobileNetV3.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/experiment_scripts/gld23k EfficientNet.md b/experiments/distributed/classification/experiment_scripts/gld23k EfficientNet.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/experiment_scripts/gld23k MobileNetV3.md 
# gld23k MobileNetV3-Large-100
gpu1:1,gpu3:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu3:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1" \ + --client_num_per_round 5 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --data_load_num_workers 2 \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --comm_round 300 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .97 +``` + + +# 10 clients +``` +# bad, killed +mpirun -np 11 -host gpu1:1,gpu3:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1,gpu8:1,gpu9:1,gpu10:1,gpu11:1,gpu13:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu3:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1;gpu8:1;gpu9:1;gpu10:1;gpu11:1;gpu13:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .97 + +mpirun -np 11 -host gpu1:1,gpu27:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1,gpu8:1,gpu9:1,gpu10:1,gpu11:1,gpu13:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu27:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1;gpu8:1;gpu9:1;gpu10:1;gpu11:1;gpu13:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 
\ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .97 + +mpirun -np 11 -host gpu1:1,gpu3:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1,gpu8:1,gpu9:1,gpu10:1,gpu11:1,gpu13:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu3:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1;gpu8:1;gpu9:1;gpu10:1;gpu11:1;gpu13:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .992 + +mpirun -np 11 -host gpu14:1,gpu15:1,gpu16:1,gpu17:1,gpu19:1,gpu20:1,gpu21:1,gpu22:1,gpu23:1,gpu24:1,gpu26:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu14:1;gpu15:1;gpu16:1;gpu17:1;gpu19:1;gpu20:1;gpu21:1;gpu22:1;gpu23:1;gpu24:1;gpu26:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 64 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.003 --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .992 +``` + + + + + + + + + diff --git a/experiments/distributed/classification/gpuutils/DAAI_gpu_util.yaml b/experiments/distributed/classification/gpuutils/DAAI_gpu_util.yaml new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/gpuutils/scigpu_gpu_util.yaml b/experiments/distributed/classification/gpuutils/scigpu_gpu_util.yaml new file mode 100644 index 0000000..8020d5a --- /dev/null +++ b/experiments/distributed/classification/gpuutils/scigpu_gpu_util.yaml @@ -0,0 +1,36 @@ +local_10: + localhost: [2, 2, 3, 3] + +local_11: + localhost: [2, 3, 3, 3] + +local_20: + localhost: [5, 5, 5, 5] + +local_21: + localhost: [6, 5, 5, 5] + + +gpu_util_11: + scigpu10: [2, 1 ,1 ,1] + scigpu11: [1, 2, 2, 1] + +gpu_util_21: + scigpu10: [2, 1, 1, 1] + scigpu11: [1, 1, 1, 1] + scigpu13: [2, 2, 1, 2] + scigpu14: [2, 1, 1, 1] + +gpu_util_41: + scigpu10: [5, 4 ,4 ,4] + scigpu11: [4, 4, 4, 4] + scigpu12: [8, 8 ,0 ,0] + scigpu14: [2, 2, 2, 2] + +gpu_util_51: + scigpu10: [5, 4 ,4 ,4] + scigpu11: [4, 4, 4, 4] + scigpu12: [8, 8 ,0 ,0] + scigpu13: [2, 2, 2, 2] + scigpu14: [2, 2, 2, 2] + diff --git a/experiments/distributed/classification/gpuutils/t716_gpu_util.yaml b/experiments/distributed/classification/gpuutils/t716_gpu_util.yaml new file mode 100644 index 0000000..0fdd753 --- /dev/null +++ b/experiments/distributed/classification/gpuutils/t716_gpu_util.yaml @@ -0,0 +1,121 @@ +4gpus: + gpu1: [1] + gpu5: [1] + gpu3: [1] + gpu4: [1] + +4gpus_1: + gpu6: [1] + gpu7: [1] + gpu8: [1] + gpu9: [1] + +4gpus_2: + gpu10: [1] + gpu11: [1] + gpu13: [1] + gpu14: [1] +4gpus_3: + gpu15: [1] + gpu16: [1] + gpu17: [1] + gpu19: [1] +4gpus_4: + gpu20: [1] + gpu21: [1] + gpu22: [1] + gpu23: [1] + +5gpus: + gpu1: [1] + gpu2: [1] + gpu3: [1] + gpu4: [1] + gpu5: [1] + +10gpus: + gpu1: [2] + gpu2: [2] + gpu3: [2] + gpu4: [2] + gpu5: [2] + +11gpus_2: + gpu1: [1] + gpu3: [1] + gpu4: [1] + 
gpu5: [1] + gpu6: [1] + gpu7: [1] + gpu8: [1] + gpu9: [1] + gpu10: [1] + gpu11: [1] + gpu13: [1] + +20gpus: + gpu1: [2] + gpu2: [2] + gpu3: [2] + gpu4: [2] + gpu5: [2] + gpu6: [2] + gpu7: [2] + gpu8: [2] + gpu9: [2] + gpu10: [2] + +21gpus_2: + gpu1: [1] + gpu3: [1] + gpu4: [1] + gpu5: [1] + gpu6: [1] + gpu7: [1] + gpu8: [1] + gpu9: [1] + gpu10: [1] + gpu11: [1] + gpu13: [1] + gpu14: [1] + gpu15: [1] + gpu16: [1] + gpu17: [1] + gpu19: [1] + gpu20: [1] + gpu21: [1] + gpu22: [1] + gpu23: [1] + gpu24: [1] + +40gpus: + gpu1: [2] + gpu2: [2] + gpu3: [2] + gpu4: [2] + gpu5: [2] + gpu6: [2] + gpu7: [2] + gpu8: [2] + gpu9: [2] + gpu10: [2] + gpu11: [2] + gpu12: [2] + gpu13: [2] + gpu14: [2] + gpu15: [2] + gpu16: [2] + gpu17: [2] + gpu18: [2] + gpu19: [2] + gpu20: [2] + + + + + + + + + + diff --git a/experiments/distributed/classification/helloworld.py b/experiments/distributed/classification/helloworld.py new file mode 100644 index 0000000..b1de7cf --- /dev/null +++ b/experiments/distributed/classification/helloworld.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +""" +Parallel Hello World +""" + +from mpi4py import MPI +import sys +import time + +size = MPI.COMM_WORLD.Get_size() +rank = MPI.COMM_WORLD.Get_rank() +name = MPI.Get_processor_name() + +sys.stdout.write("Hello, World! 
I am process %d of %d on %s.\n" % (rank, size, name)) +time.sleep(300) diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile new file mode 100644 index 0000000..75c8e45 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile @@ -0,0 +1 @@ +localhost:10 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_11 b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_11 new file mode 100644 index 0000000..ab8648d --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_11 @@ -0,0 +1 @@ +localhost:11 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_20 b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_20 new file mode 100644 index 0000000..d211bf4 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_20 @@ -0,0 +1 @@ +localhost:20 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_21 b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_21 new file mode 100644 index 0000000..952486e --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_21 @@ -0,0 +1 @@ +localhost:21 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_10 b/experiments/distributed/classification/hostfiles/t716_hostfile_10 new file mode 100644 index 0000000..e5a064e --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_10 @@ -0,0 +1,5 @@ +gpu1 slots=2 +gpu2 slots=2 +gpu3 slots=2 +gpu4 slots=2 +gpu5 slots=2 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_11_2 
b/experiments/distributed/classification/hostfiles/t716_hostfile_11_2 new file mode 100644 index 0000000..e06c571 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_11_2 @@ -0,0 +1,11 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 +gpu6 slots=1 +gpu7 slots=1 +gpu8 slots=1 +gpu9 slots=1 +gpu10 slots=1 +gpu11 slots=1 +gpu13 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_20 b/experiments/distributed/classification/hostfiles/t716_hostfile_20 new file mode 100644 index 0000000..096ab7e --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_20 @@ -0,0 +1,10 @@ +gpu1 slots=2 +gpu2 slots=2 +gpu3 slots=2 +gpu4 slots=2 +gpu5 slots=2 +gpu6 slots=2 +gpu7 slots=2 +gpu8 slots=2 +gpu9 slots=2 +gpu10 slots=2 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_21_2 b/experiments/distributed/classification/hostfiles/t716_hostfile_21_2 new file mode 100644 index 0000000..d12b1c3 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_21_2 @@ -0,0 +1,21 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 +gpu6 slots=1 +gpu7 slots=1 +gpu8 slots=1 +gpu9 slots=1 +gpu10 slots=1 +gpu11 slots=1 +gpu13 slots=1 +gpu14 slots=1 +gpu15 slots=1 +gpu16 slots=1 +gpu17 slots=1 +gpu19 slots=1 +gpu20 slots=1 +gpu21 slots=1 +gpu22 slots=1 +gpu23 slots=1 +gpu24 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_4 b/experiments/distributed/classification/hostfiles/t716_hostfile_4 new file mode 100644 index 0000000..4b82e19 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_4 @@ -0,0 +1,4 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_5 
b/experiments/distributed/classification/hostfiles/t716_hostfile_5 new file mode 100644 index 0000000..c48ee2c --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_5 @@ -0,0 +1,5 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 +gpu6 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/main.py b/experiments/distributed/classification/main.py new file mode 100644 index 0000000..a421a6e --- /dev/null +++ b/experiments/distributed/classification/main.py @@ -0,0 +1,539 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback +import yaml + +import numpy as np +import psutil +import setproctitle +import torch +import wandb +from mpi4py import MPI + +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +from FedML.fedml_api.distributed.fedavg.FedAvgAPI import FedML_init, FedML_FedAvg_distributed + + +from data_preprocessing.ImageNet.data_loader import load_partition_data_ImageNet +from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + +from training.fedavg_classification_trainer import ClassificationTrainer + +from utils.context import ( + raise_MPI_error +) +from utils.logger import ( + logging_config +) + + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network 
used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + # parser.add_argument('--batch_size', type=int, default=64, metavar='N', + # help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + parser.add_argument('--epochs', type=int, default=5, metavar='EP', + help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--gpu_util_file', 
type=str, default=None, + help='the gpu utilization file for servers and clients. If there is no \ + gpu_util_file, gpu will not be used.') + parser.add_argument('--gpu_util_key', type=str, default=None, + help='the key in gpu utilization file') + parser.add_argument('--gpu_util_parse', type=str, default=None, + help='the gpu utilization string for servers and clients. If there is no \ + gpu_util_parse, gpu will not be used. Note if this and gpu_util_file are \ + both defined, gpu_util_parse will be used but not gpu_util_file') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # logging settings + parser.add_argument('--level', type=str, default='INFO', + help='level of logging') + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + parser.add_argument('-b', '--batch-size', 
type=int, default=32, metavar='N', + help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.0001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default=None, type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, 
default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + # parser.add_argument('--epochs', type=int, default=200, metavar='N', + # help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + parser.add_argument('--decay-rounds', type=float, default=30, metavar='N', + help='round interval to decay LR') + + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + 
parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. 
(default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + 
parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_ImageNet(dataset=dataset_name, data_dir=args.data_dir, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'mini_gld_train_split.csv') + fed_test_map_file = os.path.join(args.data_dir, 'mini_gld_test.csv') + + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + elif dataset_name == "gld160k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + 
raise Exception("no such dataset") + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + + +def init_training_device(process_ID, fl_worker_num, gpu_num_per_machine): + # initialize the mapping from process ID to GPU ID: + if process_ID == 0: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + return device + process_gpu_dict = dict() + for client_index in range(fl_worker_num): + gpu_index = client_index % gpu_num_per_machine + process_gpu_dict[client_index] = gpu_index + + logging.info(process_gpu_dict) + device = torch.device("cuda:" + str(process_gpu_dict[process_ID - 1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + return device + +def 
init_training_device_from_gpu_util_file(process_id, worker_number, gpu_util_file, gpu_util_key): + + if gpu_util_file == None: + device = torch.device("cpu") + logging.info(" !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logging.info(" ################## Not Indicate gpu_util_file, using cpu #################") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + else: + with open(gpu_util_file, 'r') as f: + gpu_util_yaml = yaml.load(f, Loader=yaml.FullLoader) + # gpu_util_num_process = 'gpu_util_' + str(worker_number) + # gpu_util = gpu_util_yaml[gpu_util_num_process] + gpu_util = gpu_util_yaml[gpu_util_key] + gpu_util_map = {} + i = 0 + for host, gpus_util_map_host in gpu_util.items(): + for gpu_j, num_process_on_gpu in enumerate(gpus_util_map_host): + for _ in range(num_process_on_gpu): + gpu_util_map[i] = (host, gpu_j) + i += 1 + logging.info("Process %d running on host: %s,gethostname: %s, gpu: %d ..." % ( + process_id, gpu_util_map[process_id][0], socket.gethostname(), gpu_util_map[process_id][1])) + assert i == worker_number + + device = torch.device("cuda:" + str(gpu_util_map[process_id][1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + +def init_training_device_from_gpu_util_parse(process_id, worker_number, gpu_util_parse): + if gpu_util_parse == None: + device = torch.device("cpu") + logging.info(" !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logging.info(" ################## Not Indicate gpu_util_file, using cpu #################") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + else: + # example parse str `gpu_util_parse`: + # "gpu1:0,1,1,2;gpu2:3,3,3;gpu3:0,0,0,1,2,4,4,0" + gpu_util_parse_temp = gpu_util_parse.split(';') + gpu_util_parse_temp = [(item.split(':')[0], item.split(':')[1]) for item in gpu_util_parse_temp ] + + gpu_util = {} + for (host, gpus_str) in gpu_util_parse_temp: + 
gpu_util[host] = [int(num_process_on_gpu) for num_process_on_gpu in gpus_str.split(',')] + + gpu_util_map = {} + i = 0 + for host, gpus_util_map_host in gpu_util.items(): + for gpu_j, num_process_on_gpu in enumerate(gpus_util_map_host): + for _ in range(num_process_on_gpu): + gpu_util_map[i] = (host, gpu_j) + i += 1 + logging.info("Process %d running on host: %s,gethostname: %s, gpu: %d ..." % ( + process_id, gpu_util_map[process_id][0], socket.gethostname(), gpu_util_map[process_id][1])) + assert i == worker_number + + device = torch.device("cuda:" + str(gpu_util_map[process_id][1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + + + +if __name__ == "__main__": + # initialize distributed computing (MPI) + comm, process_id, worker_number = FedML_init() + + with raise_MPI_error(): + # parse python script input parameters + parser = argparse.ArgumentParser() + args = add_args(parser) + args.rank = process_id + args.wd = args.weight_decay + + logging.info(args) + + # customize the process name + str_process_name = 'fedavg' + " :" + str(process_id) + setproctitle.setproctitle(str_process_name) + + logging_config(args, process_id) + + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). + name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + if process_id == 0: + wandb.init( + entity="automl", + project="fedcv-classification", + name="fedavg (d)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + # Set the random seed. The np.random seed determines the dataset partition. + # The torch_manual_seed determines the initial weight. 
+ # We fix these two, so that we can reproduce the result. + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # GPU arrangement: Please customize this function according your own topology. + # The GPU server list is configured at "mpi_host_file". + # If we have 4 machines and each has two GPUs, and your FL network has 8 workers and a central worker. + # The 4 machines will be assigned as follows: + # machine 1: worker0, worker4, worker8; + # machine 2: worker1, worker5; + # machine 3: worker2, worker6; + # machine 4: worker3, worker7; + # Therefore, we can see that workers are assigned according to the order of machine list. + logging.info("process_id = %d, size = %d" % (process_id, worker_number)) + if args.gpu_util_parse is not None: + device = init_training_device_from_gpu_util_parse(process_id, worker_number, args.gpu_util_parse) + else: + device = init_training_device_from_gpu_util_file(process_id, worker_number, args.gpu_util_file, args.gpu_util_key) + + # load data + dataset = load_data(args, args.dataset) + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. 
+ # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + + model_trainer = ClassificationTrainer(model, device, args) + FedML_FedAvg_distributed(process_id, worker_number, device, comm, + model, train_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args, model_trainer) + + + + + + diff --git a/experiments/distributed/classification/main_fedavg.py b/experiments/distributed/classification/main_fedavg.py deleted file mode 100644 index 939db22..0000000 --- a/experiments/distributed/classification/main_fedavg.py +++ /dev/null @@ -1,236 +0,0 @@ -import argparse -import logging -import os -import random -import socket -import sys -import traceback - -import numpy as np -import psutil -import setproctitle -import torch -import wandb -from mpi4py import MPI - -sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) - -from FedML.fedml_api.distributed.fedavg.FedAvgAPI import FedML_init, FedML_FedAvg_distributed - -from data_preprocessing.ImageNet.data_loader import load_partition_data_ImageNet -from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks -from model.classification.efficientnet import EfficientNet -from model.classification.mobilenet_v3 import MobileNetV3 -from training.classification_trainer import ClassificationTrainer - - -def add_args(parser): - """ - parser : argparse.ArgumentParser - return a parser added with args required by fit - """ - # Training settings - parser.add_argument('--model', type=str, default='mobilenet', metavar='N', - help='neural network used in training') - - parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', - help='dataset used for training') - - parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', - help='data directory') - - 
parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', - help='how to partition the dataset on local workers') - - parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', - help='partition alpha (default: 0.5)') - - parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', - help='number of workers in a distributed cluster') - - parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', - help='number of workers') - - parser.add_argument('--batch_size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - - parser.add_argument('--client_optimizer', type=str, default='adam', - help='SGD with momentum; adam') - - parser.add_argument('--lr', type=float, default=0.001, metavar='LR', - help='learning rate (default: 0.001)') - - parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) - - parser.add_argument('--epochs', type=int, default=5, metavar='EP', - help='how many epochs will be trained locally') - - parser.add_argument('--comm_round', type=int, default=10, - help='how many round of communications we shoud use') - - parser.add_argument('--is_mobile', type=int, default=0, - help='whether the program is running on the FedML-Mobile server side') - - parser.add_argument('--frequency_of_the_test', type=int, default=1, - help='the frequency of the algorithms') - - parser.add_argument('--gpu_server_num', type=int, default=1, - help='gpu_server_num') - - parser.add_argument('--gpu_num_per_server', type=int, default=4, - help='gpu_num_per_server') - - parser.add_argument('--ci', type=int, default=0, - help='CI') - args = parser.parse_args() - return args - - -def load_data(args, dataset_name): - if dataset_name == "ILSVRC2012": - logging.info("load_data. 
dataset_name = %s" % dataset_name) - train_data_num, test_data_num, train_data_global, test_data_global, \ - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ - class_num = load_partition_data_ImageNet(dataset=dataset_name, data_dir=args.data_dir, - partition_method=None, partition_alpha=None, - client_number=args.client_num_in_total, batch_size=args.batch_size) - - elif dataset_name == "gld23k": - logging.info("load_data. dataset_name = %s" % dataset_name) - args.client_num_in_total = 233 - fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') - fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') - args.data_dir = os.path.join(args.data_dir, 'images') - - train_data_num, test_data_num, train_data_global, test_data_global, \ - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ - class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, - fed_train_map_file=fed_train_map_file, - fed_test_map_file=fed_test_map_file, - partition_method=None, partition_alpha=None, - client_number=args.client_num_in_total, batch_size=args.batch_size) - - elif dataset_name == "gld160k": - logging.info("load_data. 
dataset_name = %s" % dataset_name) - args.client_num_in_total = 1262 - fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') - fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') - args.data_dir = os.path.join(args.data_dir, 'images') - - train_data_num, test_data_num, train_data_global, test_data_global, \ - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ - class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, - fed_train_map_file=fed_train_map_file, - fed_test_map_file=fed_test_map_file, - partition_method=None, partition_alpha=None, - client_number=args.client_num_in_total, batch_size=args.batch_size) - else: - raise Exception("no such dataset") - - dataset = [train_data_num, test_data_num, train_data_global, test_data_global, - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] - return dataset - - -def create_model(args, model_name, output_dim): - logging.info("create_model. 
model_name = %s, output_dim = %s" % (model_name, output_dim)) - if model_name == 'mobilenet_v3': - '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' - model = MobileNetV3(model_mode='LARGE') - elif model_name == 'efficientnet': - model = EfficientNet() - else: - raise Exception("no such model") - return model - - -def init_training_device(process_ID, fl_worker_num, gpu_num_per_machine): - # initialize the mapping from process ID to GPU ID: - if process_ID == 0: - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - return device - process_gpu_dict = dict() - for client_index in range(fl_worker_num): - gpu_index = client_index % gpu_num_per_machine - process_gpu_dict[client_index] = gpu_index - - logging.info(process_gpu_dict) - device = torch.device("cuda:" + str(process_gpu_dict[process_ID - 1]) if torch.cuda.is_available() else "cpu") - logging.info(device) - return device - - -if __name__ == "__main__": - # initialize distributed computing (MPI) - comm, process_id, worker_number = FedML_init() - - # parse python script input parameters - parser = argparse.ArgumentParser() - args = add_args(parser) - logging.info(args) - - # customize the process name - str_process_name = "FedAvg (distributed):" + str(process_id) - setproctitle.setproctitle(str_process_name) - - # customize the log format - # logging.basicConfig(level=logging.INFO, - logging.basicConfig(level=logging.DEBUG, - format=str( - process_id) + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', - datefmt='%a, %d %b %Y %H:%M:%S') - hostname = socket.gethostname() - logging.info("#############process ID = " + str(process_id) + - ", host name = " + hostname + "########" + - ", process ID = " + str(os.getpid()) + - ", process Name = " + str(psutil.Process(os.getpid()))) - - # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). 
- if process_id == 0: - wandb.init( - # project="federated_nas", - project="fedml", - name="FedAVG(d)" + str(args.partition_method) + "r" + str(args.comm_round) + "-e" + str( - args.epochs) + "-lr" + str( - args.lr), - config=args - ) - - # Set the random seed. The np.random seed determines the dataset partition. - # The torch_manual_seed determines the initial weight. - # We fix these two, so that we can reproduce the result. - random.seed(0) - np.random.seed(0) - torch.manual_seed(0) - torch.cuda.manual_seed_all(0) - - # GPU arrangement: Please customize this function according your own topology. - # The GPU server list is configured at "mpi_host_file". - # If we have 4 machines and each has two GPUs, and your FL network has 8 workers and a central worker. - # The 4 machines will be assigned as follows: - # machine 1: worker0, worker4, worker8; - # machine 2: worker1, worker5; - # machine 3: worker2, worker6; - # machine 4: worker3, worker7; - # Therefore, we can see that workers are assigned according to the order of machine list. - logging.info("process_id = %d, size = %d" % (process_id, worker_number)) - device = init_training_device(process_id, worker_number - 1, args.gpu_num_per_server) - - # load data - dataset = load_data(args, args.dataset) - [train_data_num, test_data_num, train_data_global, test_data_global, - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset - - # create model. - # Note if the model is DNN (e.g., ResNet), the training will be very slow. 
- # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) - model = create_model(args, model_name=args.model, output_dim=dataset[7]) - - # define my own trainer - model_trainer = ClassificationTrainer(model) - - # start "federated averaging (FedAvg)" - FedML_FedAvg_distributed(process_id, worker_number, device, comm, - model, train_data_num, train_data_global, test_data_global, - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, - args, model_trainer) \ No newline at end of file diff --git a/experiments/distributed/classification/mpi_host_file b/experiments/distributed/classification/mpi_host_file index cf5c6bd..9f3c558 100644 --- a/experiments/distributed/classification/mpi_host_file +++ b/experiments/distributed/classification/mpi_host_file @@ -1 +1 @@ -ChaoyangHe-GPU-RTX2080Tix4 +ChaoyangHe-GPU-RTX2080Tix4 \ No newline at end of file diff --git a/experiments/distributed/classification/parse.py b/experiments/distributed/classification/parse.py new file mode 100644 index 0000000..556608c --- /dev/null +++ b/experiments/distributed/classification/parse.py @@ -0,0 +1,25 @@ +def parse(process_id, worker_number, gpu_util_parse): + gpu_util_parse_temp = gpu_util_parse.split(';') + gpu_util_parse_temp = [(item.split(':')[0], item.split(':')[1]) for item in gpu_util_parse_temp ] + + gpu_util = {} + for (host, gpus_str) in gpu_util_parse_temp: + gpu_util[host] = [int(num_process_on_gpu) for num_process_on_gpu in gpus_str.split(',')] + + gpu_util_map = {} + i = 0 + for host, gpus_util_map_host in gpu_util.items(): + for gpu_j, num_process_on_gpu in enumerate(gpus_util_map_host): + for _ in range(num_process_on_gpu): + gpu_util_map[i] = (host, gpu_j) + i += 1 + print(gpu_util_map) + + +parse(0, 10, "local:2,2,0,6") + + + + + + diff --git a/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh b/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh index 8e98eef..b6f1578 
100644 --- a/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh +++ b/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh @@ -1,37 +1,19 @@ #!/usr/bin/env bash -CLIENT_NUM=$1 -WORKER_NUM=$2 -SERVER_NUM=$3 -GPU_NUM_PER_SERVER=$4 -MODEL=$5 -DISTRIBUTION=$6 -ROUND=$7 -EPOCH=$8 -BATCH_SIZE=$9 -LR=${10} -DATASET=${11} -DATA_DIR=${12} -CLIENT_OPTIMIZER=${13} -CI=${14} +WORKER_NUM=$1 +MPI_HOST_FILE=$2 +PYTHON=$3 +ARGS=$4 + PROCESS_NUM=`expr $WORKER_NUM + 1` echo $PROCESS_NUM +echo $MPI_HOST_FILE + -hostname > mpi_host_file -mpirun -np $PROCESS_NUM -hostfile ./mpi_host_file python3 ./main_fedavg.py \ - --gpu_server_num $SERVER_NUM \ - --gpu_num_per_server $GPU_NUM_PER_SERVER \ - --model $MODEL \ - --dataset $DATASET \ - --data_dir $DATA_DIR \ - --partition_method $DISTRIBUTION \ - --client_num_in_total $CLIENT_NUM \ +mpirun -np $PROCESS_NUM -hostfile ./$MPI_HOST_FILE $PYTHON ./main.py \ --client_num_per_round $WORKER_NUM \ - --comm_round $ROUND \ - --epochs $EPOCH \ - --client_optimizer $CLIENT_OPTIMIZER \ - --batch_size $BATCH_SIZE \ - --lr $LR \ - --ci $CI + $ARGS + + diff --git a/experiments/distributed/classification/run_with_conf.sh b/experiments/distributed/classification/run_with_conf.sh new file mode 100755 index 0000000..25dd94d --- /dev/null +++ b/experiments/distributed/classification/run_with_conf.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +WORKER_NUM=$1 +MPI_HOST_FILE=$2 +DATASET=$3 +DATA_DIR=$4 +ARGS=$5 + + +source configs/cluster.conf +PYTHON=`cat configs/cluster.conf | grep PYTHON | awk -F "=" '{print $2}'` +data_dir=`cat configs/cluster.conf | grep $DATA_DIR | awk -F "=" '{print $2}'` + + +PROCESS_NUM=`expr $WORKER_NUM + 1` +echo $PROCESS_NUM +echo $MPI_HOST_FILE +echo $PYTHON +echo $data_dir + + + + +mpirun -np $PROCESS_NUM -hostfile ./$MPI_HOST_FILE \ + $PYTHON ./main.py \ + --data_dir $data_dir --dataset $DATASET \ + --client_num_per_round $WORKER_NUM \ + $ARGS + + diff --git 
a/experiments/distributed/classification/run_with_conf_t716.sh b/experiments/distributed/classification/run_with_conf_t716.sh new file mode 100644 index 0000000..ef67922 --- /dev/null +++ b/experiments/distributed/classification/run_with_conf_t716.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +WORKER_NUM=$1 +MPI_HOST_FILE=$2 +DATASET=$3 +DATA_DIR=$4 +ARGS=$5 + + +source configs/cluster.conf +PYTHON=`cat configs/cluster.conf | grep PYTHON | awk -F "=" '{print $2}'` +data_dir=`cat configs/cluster.conf | grep $DATA_DIR | awk -F "=" '{print $2}'` + + +PROCESS_NUM=`expr $WORKER_NUM + 1` +echo $PROCESS_NUM +echo $MPI_HOST_FILE +echo $PYTHON +echo $data_dir + + + + +mpirun -np $PROCESS_NUM -hostfile ./$MPI_HOST_FILE \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + $PYTHON ./main.py \ + --data_dir $data_dir --dataset $DATASET \ + --client_num_per_round $WORKER_NUM \ + $ARGS + + diff --git a/experiments/distributed/classification/sbatch_run.sh b/experiments/distributed/classification/sbatch_run.sh new file mode 100644 index 0000000..37f1e02 --- /dev/null +++ b/experiments/distributed/classification/sbatch_run.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +#SBATCH -o /apps/mpi/myjob.out +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=2 +mpirun python /apps/mpi/helloworld.py + + + diff --git a/experiments/distributed/classification/slurm-5793.out b/experiments/distributed/classification/slurm-5793.out new file mode 100644 index 0000000..abf1d78 --- /dev/null +++ b/experiments/distributed/classification/slurm-5793.out @@ -0,0 +1,4 @@ +Traceback (most recent call last): + File "/var/spool/slurmd/job05793/slurm_script", line 6, in + from mpi4py import MPI +ModuleNotFoundError: No module named 'mpi4py' diff --git a/experiments/standalone/__init__.py b/experiments/standalone/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/standalone/classification/.main.py.swp b/experiments/standalone/classification/.main.py.swp new file mode 100644 index 0000000..ff42f13 Binary 
files /dev/null and b/experiments/standalone/classification/.main.py.swp differ diff --git a/experiments/standalone/classification/README.md b/experiments/standalone/classification/README.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/standalone/classification/__init__.py b/experiments/standalone/classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/standalone/classification/client.py b/experiments/standalone/classification/client.py new file mode 100644 index 0000000..6e4f5db --- /dev/null +++ b/experiments/standalone/classification/client.py @@ -0,0 +1,40 @@ +import logging + + +class Client: + + def __init__(self, client_idx, local_training_data, local_test_data, local_sample_number, args, device, + model_trainer): + self.client_idx = client_idx + self.local_training_data = local_training_data + self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + logging.info("self.local_sample_number = " + str(self.local_sample_number)) + + self.args = args + self.device = device + self.model_trainer = model_trainer + + def update_local_dataset(self, client_idx, local_training_data, local_test_data, local_sample_number): + self.client_idx = client_idx + self.local_training_data = local_training_data + self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + + def get_sample_number(self): + return self.local_sample_number + + def train(self, w_global, round_idx): + self.args.round_idx = round_idx + self.model_trainer.set_model_params(w_global) + self.model_trainer.train(self.local_training_data, self.device, self.args) + weights = self.model_trainer.get_model_params() + return weights + + def local_test(self, b_use_test_dataset): + if b_use_test_dataset: + test_data = self.local_test_data + else: + test_data = self.local_training_data + metrics = self.model_trainer.test(test_data, self.device, self.args) + return metrics diff --git 
a/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md new file mode 100644 index 0000000..2c24943 --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md @@ -0,0 +1,61 @@ +# ILSVRC2012-100 + + +``` +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ 
+--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +``` + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md new file mode 100644 index 0000000..554ef6c --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md @@ -0,0 +1,202 @@ +# ILSVRC2012-100 + + + +# 10 clients +``` + +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset 
ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 256 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ 
+--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +# scigpu +~/anaconda3/envs/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 256 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ 
+--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +# ================================================================================================== +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 
--opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 +``` + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/cifar100 EfficientNet.md b/experiments/standalone/classification/experiment_scripts/cifar100 EfficientNet.md new file mode 100644 index 0000000..eaec2ca --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/cifar100 EfficientNet.md @@ -0,0 +1,30 @@ +# gld23k + +# 10 clients +``` +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + + +``` + + + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/cifar100 MobileNetV3.md b/experiments/standalone/classification/experiment_scripts/cifar100 MobileNetV3.md new file mode 100644 index 0000000..19df2ff --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/cifar100 MobileNetV3.md @@ -0,0 +1,32 @@ +# gld23k + +# 10 clients +``` +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 
100 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + + + + +``` + + + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/gld23k EfficientNet.md b/experiments/standalone/classification/experiment_scripts/gld23k EfficientNet.md new file mode 100644 index 0000000..0c011e7 --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/gld23k EfficientNet.md @@ -0,0 +1,255 @@ +# gld23k + + +# 4 clients +``` + +# DAAI + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 
--data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 +``` + + +# 10 clients + +``` +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 
--data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + + +# ========================================================================= +# high decay-rate + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k 
--data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.1 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ 
+--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.005 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +``` + + + + + + + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/gld23k MobileNetV3.md b/experiments/standalone/classification/experiment_scripts/gld23k MobileNetV3.md new file mode 
100644 index 0000000..ba67014 --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/gld23k MobileNetV3.md @@ -0,0 +1,173 @@ +# gld23k + + + +# 10 clients +``` + +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +# ========================================================================= +# high decay-rate + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ 
+--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model 
mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.1 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 
0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.005 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + + +``` + + + + + + + + + + + + + + diff --git a/experiments/standalone/classification/fedavg_api.py b/experiments/standalone/classification/fedavg_api.py new file mode 100644 index 0000000..c4d3154 --- /dev/null +++ b/experiments/standalone/classification/fedavg_api.py @@ -0,0 +1,216 @@ +import copy +import logging +import random + +import numpy as np +import torch +import wandb + +from client import Client + + +class FedAvgAPI(object): + def __init__(self, dataset, device, args, model_trainer): + self.device = device + self.args = args + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + self.train_global = train_data_global + self.test_global = test_data_global + self.val_global = None + self.train_data_num_in_total = train_data_num + self.test_data_num_in_total = test_data_num + + self.client_list = [] + self.train_data_local_num_dict = train_data_local_num_dict + self.train_data_local_dict = train_data_local_dict + self.test_data_local_dict = test_data_local_dict + + self.model_trainer = model_trainer + self._setup_clients(train_data_local_num_dict, train_data_local_dict, test_data_local_dict, model_trainer) + + def _setup_clients(self, train_data_local_num_dict, train_data_local_dict, test_data_local_dict, model_trainer): + logging.info("############setup_clients (START)#############") + for client_idx in range(self.args.client_num_per_round): + c = Client(client_idx, train_data_local_dict[client_idx], test_data_local_dict[client_idx], + train_data_local_num_dict[client_idx], self.args, self.device, model_trainer) + self.client_list.append(c) + logging.info("############setup_clients (END)#############") + + def train(self): + w_global = self.model_trainer.get_model_params() + for 
round_idx in range(self.args.comm_round): + + logging.info("################Communication round : {}".format(round_idx)) + + w_locals = [] + + """ + for scalability: following the original FedAvg algorithm, we uniformly sample a fraction of clients in each round. + Instead of changing the 'Client' instances, our implementation keeps the 'Client' instances and then updates their local dataset + """ + client_indexes = self._client_sampling(round_idx, self.args.client_num_in_total, + self.args.client_num_per_round) + logging.info("client_indexes = " + str(client_indexes)) + + for idx, client in enumerate(self.client_list): + # update dataset + client_idx = client_indexes[idx] + client.update_local_dataset(client_idx, self.train_data_local_dict[client_idx], + self.test_data_local_dict[client_idx], + self.train_data_local_num_dict[client_idx]) + + # train on new dataset + w = client.train(w_global, round_idx) + # self.logger.info("local weights = " + str(w)) + w_locals.append((client.get_sample_number(), copy.deepcopy(w))) + + # update global weights + w_global = self._aggregate(w_locals) + self.model_trainer.set_model_params(w_global) + + # test results + # at last round + if round_idx == self.args.comm_round - 1: + self._local_test_on_all_clients(round_idx) + # per {frequency_of_the_test} round + elif round_idx % self.args.frequency_of_the_test == 0: + if self.args.dataset.startswith("stackoverflow"): + self._local_test_on_validation_set(round_idx) + else: + self._local_test_on_all_clients(round_idx) + + def _client_sampling(self, round_idx, client_num_in_total, client_num_per_round): + if client_num_in_total == client_num_per_round: + client_indexes = [client_index for client_index in range(client_num_in_total)] + else: + num_clients = min(client_num_per_round, client_num_in_total) + np.random.seed(round_idx) # make sure for each comparison, we are selecting the same clients each round + client_indexes = np.random.choice(range(client_num_in_total), num_clients, 
replace=False) + logging.info("client_indexes = %s" % str(client_indexes)) + return client_indexes + + def _generate_validation_set(self, num_samples=10000): + test_data_num = len(self.test_global.dataset) + sample_indices = random.sample(range(test_data_num), min(num_samples, test_data_num)) + subset = torch.utils.data.Subset(self.test_global.dataset, sample_indices) + sample_testset = torch.utils.data.DataLoader(subset, batch_size=self.args.batch_size) + self.val_global = sample_testset + + def _aggregate(self, w_locals): + training_num = 0 + for idx in range(len(w_locals)): + (sample_num, averaged_params) = w_locals[idx] + training_num += sample_num + + (sample_num, averaged_params) = w_locals[0] + for k in averaged_params.keys(): + for i in range(0, len(w_locals)): + local_sample_number, local_model_params = w_locals[i] + w = local_sample_number / training_num + if i == 0: + averaged_params[k] = local_model_params[k] * w + else: + averaged_params[k] += local_model_params[k] * w + return averaged_params + + def _local_test_on_all_clients(self, round_idx): + + logging.info("################local_test_on_all_clients : {}".format(round_idx)) + + # train_metrics = { + # 'num_samples': [], + # 'num_correct': [], + # 'losses': [] + # } + + test_metrics = { + 'num_samples': [], + 'num_correct': [], + 'losses': [] + } + + client = self.client_list[0] + + for client_idx in range(self.args.client_num_in_total): + """ + Note: for datasets like "fed_CIFAR100" and "fed_shakespheare", + the training client number is larger than the testing client number + """ + if self.args.dataset in ['gld23k', 'gld160k'] and client_idx > 1: + break + if self.test_data_local_dict[client_idx] is None: + continue + client.update_local_dataset(0, self.train_data_local_dict[client_idx], + self.test_data_local_dict[client_idx], + self.train_data_local_num_dict[client_idx]) + # # train data + # train_local_metrics = client.local_test(False) + # 
train_metrics['num_samples'].append(copy.deepcopy(train_local_metrics['test_total'])) + # train_metrics['num_correct'].append(copy.deepcopy(train_local_metrics['test_correct'])) + # train_metrics['losses'].append(copy.deepcopy(train_local_metrics['test_loss'])) + + # test data + test_local_metrics = client.local_test(True) + test_metrics['num_samples'].append(copy.deepcopy(test_local_metrics['test_total'])) + test_metrics['num_correct'].append(copy.deepcopy(test_local_metrics['test_correct'])) + test_metrics['losses'].append(copy.deepcopy(test_local_metrics['test_loss'])) + + """ + Note: CI environment is CPU-based computing. + The training speed for RNN training is to slow in this setting, so we only test a client to make sure there is no programming error. + """ + if self.args.ci == 1: + break + + # test on training dataset + # train_acc = sum(train_metrics['num_correct']) / sum(train_metrics['num_samples']) + # train_loss = sum(train_metrics['losses']) / sum(train_metrics['num_samples']) + + # test on test dataset + test_acc = sum(test_metrics['num_correct']) / sum(test_metrics['num_samples']) + test_loss = sum(test_metrics['losses']) / sum(test_metrics['num_samples']) + + # stats = {'training_acc': train_acc, 'training_loss': train_loss} + # wandb.log({"Train/Acc": train_acc, "round": round_idx}) + # wandb.log({"Train/Loss": train_loss, "round": round_idx}) + # logging.info(stats) + + stats = {'test_acc': test_acc, 'test_loss': test_loss} + wandb.log({"Test/Acc": test_acc, "round": round_idx}) + wandb.log({"Test/Loss": test_loss, "round": round_idx}) + logging.info(stats) + + + def _local_test_on_validation_set(self, round_idx): + + logging.info("################local_test_on_validation_set : {}".format(round_idx)) + + if self.val_global is None: + self._generate_validation_set() + + client = self.client_list[0] + client.update_local_dataset(0, None, self.val_global, None) + # test data + test_metrics = client.local_test(True) + + if self.args.dataset == 
"stackoverflow_nwp": + test_acc = test_metrics['test_correct'] / test_metrics['test_total'] + test_loss = test_metrics['test_loss'] / test_metrics['test_total'] + stats = {'test_acc': test_acc, 'test_loss': test_loss} + wandb.log({"Test/Acc": test_acc, "round": round_idx}) + wandb.log({"Test/Loss": test_loss, "round": round_idx}) + elif self.args.dataset == "stackoverflow_lr": + test_acc = test_metrics['test_correct'] / test_metrics['test_total'] + test_pre = test_metrics['test_precision'] / test_metrics['test_total'] + test_rec = test_metrics['test_recall'] / test_metrics['test_total'] + test_loss = test_metrics['test_loss'] / test_metrics['test_total'] + stats = {'test_acc': test_acc, 'test_pre': test_pre, 'test_rec': test_rec, 'test_loss': test_loss} + wandb.log({"Test/Acc": test_acc, "round": round_idx}) + wandb.log({"Test/Pre": test_pre, "round": round_idx}) + wandb.log({"Test/Rec": test_rec, "round": round_idx}) + wandb.log({"Test/Loss": test_loss, "round": round_idx}) + else: + raise Exception("Unknown format to log metrics for dataset {}!"%self.args.dataset) + + logging.info(stats) diff --git a/experiments/standalone/classification/main.py b/experiments/standalone/classification/main.py new file mode 100644 index 0000000..65023bb --- /dev/null +++ b/experiments/standalone/classification/main.py @@ -0,0 +1,436 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback +import yaml + +import numpy as np +import psutil +import setproctitle +import torch +import wandb + + +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +# from FedML.fedml_api.standalone.fedavg.fedavg_api import FedAvgAPI +from fedavg_api import FedAvgAPI + +from data_preprocessing.ImageNet.data_loader import load_partition_data_ImageNet +from 
data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + +from training.fedavg_classification_trainer import ClassificationTrainer + +from utils.logger import ( + logging_config +) + + + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + # parser.add_argument('--batch_size', type=int, default=64, metavar='N', + # help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.00001) + + 
parser.add_argument('--epochs', type=int, default=5, metavar='EP', + help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--gpu', type=int, default=0, + help='gpu') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--gpu_util_file', type=str, default=None, + help='the gpu utilization file for servers and clients. If there is no \ + gpu_util_file, gpu will not be used.') + parser.add_argument('--gpu_util_key', type=str, default=None, + help='the key in gpu utilization file') + parser.add_argument('--gpu_util_parse', type=str, default=None, + help='the gpu utilization string for servers and clients. If there is no \ + gpu_util_parse, gpu will not be used. 
Note if this and gpu_util_file are \ + both defined, gpu_util_parse will be used but not gpu_util_file') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # logging settings + parser.add_argument('--level', type=str, default='INFO', + help='level of logging') + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). 
Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.00001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default=None, type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound 
for cyclic schedulers that hit 0 (1e-5)') + # parser.add_argument('--epochs', type=int, default=200, metavar='N', + # help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + parser.add_argument('--decay-rounds', type=float, default=30, metavar='N', + help='round interval to decay LR') + + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. 
"v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + 
parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_ImageNet(dataset=dataset_name, data_dir=args.data_dir, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'mini_gld_train_split.csv') + fed_test_map_file = os.path.join(args.data_dir, 'mini_gld_test.csv') + + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + elif dataset_name == "gld160k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + raise Exception("no such dataset") + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = data_loader(args.dataset, args.data_dir, args.partition_method, + args.partition_alpha, args.client_num_in_total, args.batch_size) + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. 
model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + +if __name__ == "__main__": + logging.basicConfig() + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + + # parser = add_args(argparse.ArgumentParser(description='FedAvg-standalone')) + # parser = argparse.ArgumentParser(description='FedAvg-standalone') + + parser = argparse.ArgumentParser() + args = add_args(parser) + args.rank = 0 + args.wd = args.weight_decay + + logger.info(args) + device = torch.device("cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu") + logger.info(device) + + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). 
+ name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + wandb.init( + entity="automl", + project="fedcv-classification", + name="fedavg (d)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + # Set the random seed. The np.random seed determines the dataset partition. + # The torch_manual_seed determines the initial weight. + # We fix these two, so that we can reproduce the result. + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # load data + dataset = load_data(args, args.dataset) + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. + # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + model_trainer = ClassificationTrainer(model, device, args) + logging.info(model) + + fedavgAPI = FedAvgAPI(dataset, device, args, model_trainer) + fedavgAPI.train() diff --git a/experiments/standalone/yolov5/_init.py b/experiments/standalone/yolov5/_init.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/experiments/standalone/yolov5/_init.py @@ -0,0 +1 @@ + diff --git a/experiments/standalone/yolov5/client.py b/experiments/standalone/yolov5/client.py new file mode 100644 index 0000000..847bed3 --- /dev/null +++ b/experiments/standalone/yolov5/client.py @@ -0,0 +1,590 @@ +import logging + +import os +import torch +from torch import nn + + +import time +from pathlib import Path +from threading import Thread +from warnings import warn + +import math +import random +import numpy as np +import torch.distributed as dist +import torch.nn.functional as 
F +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler +import torch.utils.data +import yaml +# from apex import amp +from torch.cuda import amp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +import test # import test.py to get mAP after each epoch +# from fedml_api.model.object_detection.yolov5.models.experimental import +from fedml_api.model.object_detection.yolov5.models.experimental import attempt_load +from fedml_api.model.object_detection.yolov5.models.yolo import Model +from fedml_api.model.object_detection.yolov5.utils.autoanchor import check_anchors +# from utils.datasets import create_dataloader +from fedml_api.model.object_detection.yolov5.utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \ + fitness, strip_optimizer, get_latest_run, check_dataset, check_file, check_git_status, check_img_size, \ + print_mutation, set_logging +from fedml_api.model.object_detection.yolov5.utils.google_utils import attempt_download +from fedml_api.model.object_detection.yolov5.utils.loss import compute_loss +from fedml_api.model.object_detection.yolov5.utils.plots import plot_images, plot_labels, plot_results, plot_evolution +from fedml_api.model.object_detection.yolov5.utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first + +from fedml_api.data_preprocessing.coco_detection.datasets import partition_data +from fedml_api.data_preprocessing.coco_detection.datasets import create_dataloader + + + +logger = logging.getLogger(__name__) + + +class Client: + + def __init__(self, client_idx, local_training_data, local_sample_number, opt, device, model, tb_writer, wandb, hyp): + self.client_idx = client_idx + self.local_training_data = local_training_data + # self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + 
logging.info("self.local_sample_number = " + str(self.local_sample_number)) + + self.opt = opt + self.device = device + self.model = model + self.hyp = hyp + self.tb_writer = tb_writer + self.wandb = wandb + + ''' + stackoverflow_lr is the task of multi-label classification + please refer to following links for detailed explainations on cross-entropy and corresponding implementation of tff research: + https://towardsdatascience.com/cross-entropy-for-classification-d98e7f974451 + https://github.com/google-research/federated/blob/49a43456aa5eaee3e1749855eed89c0087983541/optimization/stackoverflow_lr/federated_stackoverflow_lr.py#L131 + ''' + # if self.args.dataset == "stackoverflow_lr": + # self.criterion = nn.BCELoss(reduction = 'sum').to(device) + # else: + # self.criterion = nn.CrossEntropyLoss().to(device) + + def update_local_dataset(self, client_idx, local_training_data, local_sample_number): + self.client_idx = client_idx + self.local_training_data = local_training_data + # self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + + def get_sample_number(self): + return self.local_sample_number + + def train(self, w_global, dataset, dataloader, wandb): + self.wandb = wandb + logger.info(f'Hyperparameters {self.hyp}') + save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + Path( + self.opt.save_dir), self.opt.epochs, self.opt.batch_size, self.opt.total_batch_size, self.opt.weights, self.opt.global_rank + + # Directories + wdir = save_dir / 'weights' + wdir.mkdir(parents=True, exist_ok=True) # make dir + last = wdir / 'last.pt' + best = wdir / 'best.pt' + results_file = save_dir / 'results.txt' + + # Save run settings + with open(save_dir / 'hyp.yaml', 'w') as f: + yaml.dump(self.hyp, f, sort_keys=False) + with open(save_dir / 'opt.yaml', 'w') as f: + yaml.dump(vars(self.opt), f, sort_keys=False) + + # Configure + plots = not self.opt.evolve # create plots + cuda = self.device.type != 'cpu' + init_seeds(2 + rank) + 
with open(self.opt.data) as f: + data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict + with torch_distributed_zero_first(rank): + check_dataset(data_dict) # check + train_path = data_dict['train'] + test_path = data_dict['val'] + nc, names = (1, ['item']) if self.opt.single_cls else ( + int(data_dict['nc']), data_dict['names']) # number classes, names + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, self.opt.data) # check + + # Model + pretrained = weights.endswith('.pt') + if pretrained: + with torch_distributed_zero_first(rank): + attempt_download(weights) # download if not found locally + ckpt = torch.load(weights, map_location=self.device) # load checkpoint + if self.hyp.get('anchors'): + ckpt['model'].yaml['anchors'] = round(self.hyp['anchors']) # force autoanchor + model = Model(self.opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(self.device) # create + exclude = ['anchor'] if self.opt.cfg or self.hyp.get('anchors') else [] # exclude keys + state_dict = ckpt['model'].float().state_dict() # to FP32 + state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect + model.load_state_dict(state_dict, strict=False) # load + logger.info( + 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + else: + model = Model(self.opt.cfg, ch=3, nc=nc).to(self.device) # create + + # Freeze + freeze = [] # parameter names to freeze (full or partial) + for k, v in model.named_parameters(): + v.requires_grad = True # train all layers + if any(x in k for x in freeze): + print('freezing %s' % k) + v.requires_grad = False + + # Optimizer + nbs = 64 # nominal batch size + accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing + self.hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + for k, v in model.named_modules(): + if hasattr(v, 'bias') and 
isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d): + pg0.append(v.weight) # no decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + if self.opt.adam: + optimizer = optim.Adam(pg0, lr=self.hyp['lr0'], betas=(self.hyp['momentum'], 0.999)) # adjust beta1 to momentum + else: + optimizer = optim.SGD(pg0, lr=self.hyp['lr0'], momentum=self.hyp['momentum'], nesterov=True) + + optimizer.add_param_group({'params': pg1, 'weight_decay': self.hyp['weight_decay']}) # add pg1 with weight_decay + optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) + del pg0, pg1, pg2 + + # Scheduler https://arxiv.org/pdf/1812.01187.pdf + # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR + lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - self.hyp['lrf']) + self.hyp['lrf'] # cosine + scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) + # plot_lr_scheduler(optimizer, scheduler, epochs) + + # Logging + if self.wandb and self.wandb.run is None: + self.opt.hyp = self.hyp # add hyperparameters + wandb_run = self.wandb.init(config=self.opt, resume="allow", + project='YOLOv5' if self.opt.project == 'runs/train' else Path( + self.opt.project).stem, + name=save_dir.stem, + id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) + loggers = {'wandb': self.wandb} # loggers dict + + # Resume + start_epoch, best_fitness = 0, 0.0 + if pretrained: + # Optimizer + if ckpt['optimizer'] is not None: + optimizer.load_state_dict(ckpt['optimizer']) + best_fitness = ckpt['best_fitness'] + + # Results + if ckpt.get('training_results') is not None: + with open(results_file, 'w') as file: + file.write(ckpt['training_results']) # write results.txt + + # Epochs + start_epoch = ckpt['epoch'] + 1 + if self.opt.resume: + assert start_epoch > 0, '%s 
training to %g epochs is finished, nothing to resume.' % (weights, epochs) + if epochs < start_epoch: + logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % + (weights, ckpt['epoch'], epochs)) + epochs += ckpt['epoch'] # finetune additional epochs + + del ckpt, state_dict + + # Image sizes + gs = int(max(model.stride)) # grid size (max stride) + imgsz, imgsz_test = [check_img_size(x, gs) for x in self.opt.img_size] # verify imgsz are gs-multiples + + # DP mode + if cuda and rank == -1 and torch.cuda.device_count() > 1: + model = torch.nn.DataParallel(model) + + # SyncBatchNorm + if self.opt.sync_bn and cuda and rank != -1: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(self.device) + logger.info('Using SyncBatchNorm()') + + # EMA + ema = ModelEMA(model) if rank in [-1, 0] else None + + # DDP mode + if cuda and rank != -1: + model = DDP(model, device_ids=[self.opt.local_rank], output_device=self.opt.local_rank) + + # Trainloader + # dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, self.opt, + # hyp=self.hyp, augment=True, cache=self.opt.cache_images, rect=self.opt.rect, + # rank=rank, + # world_size=self.opt.world_size, workers=self.opt.workers, + # image_weights=self.opt.image_weights) + + # client + # client_number = self.opt.client_number + # partition = self.opt.partition + # net_dataidx_map = partition_data(train_path, partition=partition, n_nets=client_number) + # train_data_loader_dict = dict() + # for i in range(client_number): + # dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, self.opt, + # hyp=self.hyp, augment=True, cache=self.opt.cache_images, + # rect=self.opt.rect, + # rank=rank, + # world_size=self.opt.world_size, workers=self.opt.workers, + # image_weights=self.opt.image_weights, + # net_dataidx_map=net_dataidx_map[i]) + # + # train_data_loader_dict[i] = dataloader + # self.client_list.append(Client(i, train_data_loader_dict[i], len(dataset), 
self.opt, self.device, model)) + + # TODO: train_client + # client sampling + # client train + # logging info + + # train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) + # client_number_per_round = self.opt.client_num_per_round + + mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class + nb = len(dataloader) # number of batches + print("nb:", nb) + assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( + mlc, nc, self.opt.data, nc - 1) + + # Process 0 + if rank in [-1, 0]: + ema.updates = start_epoch * nb // accumulate # set EMA updates + testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, self.opt, # testloader + hyp=self.hyp, cache=self.opt.cache_images and not self.opt.notest, rect=True, + rank=-1, world_size=self.opt.world_size, workers=self.opt.workers, pad=0.5)[ + 0] + + if not self.opt.resume: + labels = np.concatenate(dataset.labels, 0) + c = torch.tensor(labels[:, 0]) # classes + # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency + # model._initialize_biases(cf.to(self.device)) + if plots: + Thread(target=plot_labels, args=(labels, save_dir, loggers), daemon=True).start() + if self.tb_writer: + self.tb_writer.add_histogram('classes', c, 0) + + # Anchors + if not self.opt.noautoanchor: + check_anchors(dataset, model=model, thr=self.hyp['anchor_t'], imgsz=imgsz) + + # Model parameters + self.hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset + model.nc = nc # attach number of classes to model + model.hyp = self.hyp # attach hyperparameters to model + model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) + model.class_weights = labels_to_class_weights(dataset.labels, nc).to(self.device) # attach class weights + model.names = names + + # Start training + t0 = time.time() + nw = max(round(self.hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) + # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training + maps = np.zeros(nc) # mAP per class + results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + scheduler.last_epoch = start_epoch - 1 # do not move + scaler = amp.GradScaler(enabled=cuda) + logger.info('Image sizes %g train, %g test\n' + 'Using %g dataloader workers\nLogging results to %s\n' + 'Starting training for %g epochs...' % ( + imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs)) + model = self.model + model.load_state_dict(w_global) + model.to(self.device) + for epoch in range(start_epoch, + epochs): # epoch ------------------------------------------------------------------ + model.train() + + # client_indexes = client_sampling(epoch, client_number, client_number_per_round) + # logging.info("client_indexes = " + str(client_indexes)) + + # Update image weights (optional) + if self.opt.image_weights: + # Generate indices + if rank in [-1, 0]: + cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights + iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights + dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx + # Broadcast if DDP + if rank != -1: + indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() + dist.broadcast(indices, 0) + if rank != 0: + dataset.indices = indices.cpu().numpy() + + # Update mosaic border + # b 
= int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) + # dataset.mosaic_border = [b - imgsz, -b] # height, width borders + + mloss = torch.zeros(4, device=self.device) # mean losses + if rank != -1: + dataloader.sampler.set_epoch(epoch) + pbar = enumerate(dataloader) + logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) + if rank in [-1, 0]: + pbar = tqdm(pbar, total=nb) # progress bar + optimizer.zero_grad() + for i, ( + imgs, targets, paths, + _) in pbar: # batch ------------------------------------------------------------- + ni = i + nb * epoch # number integrated batches (since train start) + imgs = imgs.to(self.device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 + + # Warmup + if ni <= nw: + xi = [0, nw] # x interp + # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) + accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) + for j, x in enumerate(optimizer.param_groups): + # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 + x['lr'] = np.interp(ni, xi, + [self.hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) + if 'momentum' in x: + x['momentum'] = np.interp(ni, xi, [self.hyp['warmup_momentum'],self.hyp['momentum']]) + + # Multi-scale + if self.opt.multi_scale: + sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size + sf = sz / max(imgs.shape[2:]) # scale factor + if sf != 1: + ns = [math.ceil(x * sf / gs) * gs for x in + imgs.shape[2:]] # new shape (stretched to gs-multiple) + imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) + + # Forward + with amp.autocast(enabled=cuda): + pred = model(imgs) # forward + loss, loss_items = compute_loss(pred, targets.to(self.device), model) # loss scaled by batch_size + if rank != -1: + loss *= self.opt.world_size # gradient averaged between devices in DDP mode + + # Backward + 
scaler.scale(loss).backward() + + # Optimize + # if ni % accumulate == 0: + # scaler.step(optimizer) # optimizer.step + # scaler.update() + # optimizer.zero_grad() + # if ema: + # ema.update(model) + + # Print + if rank in [-1, 0]: + mloss = (mloss * i + loss_items) / (i + 1) # update mean losses + mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) + s = ('%10s' * 2 + '%10.4g' * 6) % ( + '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) + pbar.set_description(s) + + # Plot + if plots and ni < 3: + f = save_dir / f'train_batch{ni}.jpg' # filename + Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() + # if tb_writer: + # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) + # tb_writer.add_graph(model, imgs) # add model to tensorboard + elif plots and ni == 3 and self.wandb: + self.wandb.log( + {"Mosaics": [self.wandb.Image(str(x), caption=x.name) for x in + save_dir.glob('train*.jpg')]}) + + # end batch ------------------------------------------------------------------------------------------------ + # end epoch ---------------------------------------------------------------------------------------------------- + + # Scheduler + lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard + scheduler.step() + + # DDP process 0 or single-GPU + if rank in [-1, 0]: + # mAP + if ema: + ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) + final_epoch = epoch + 1 == epochs + if not self.opt.notest or final_epoch: # Calculate mAP + results, maps, times = test.test(self.opt.data, + batch_size=total_batch_size, + imgsz=imgsz_test, + model=ema.ema, + single_cls=self.opt.single_cls, + dataloader=testloader, + save_dir=save_dir, + plots=plots and final_epoch, + log_imgs=self.opt.log_imgs if self.wandb else 0) + + # Write + with open(results_file, 'a') as f: + f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP@.5, mAP@.5-.95, 
val_loss(box, obj, cls) + if len(self.opt.name) and self.opt.bucket: + os.system( + 'gsutil cp %s gs://%s/results/results%s.txt' % (results_file, self.opt.bucket, self.opt.name)) + + # Log + tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss + 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', + 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss + 'x/lr0', 'x/lr1', 'x/lr2'] # params + for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): + if self.tb_writer: + self.tb_writer.add_scalar(tag, x, epoch) # tensorboard + if self.wandb: + self.wandb.log({tag: x}) # W&B + + # Update best mAP + fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95] + if fi > best_fitness: + best_fitness = fi + + # Save model + save = (not self.opt.nosave) or (final_epoch and not self.opt.evolve) + if save: + with open(results_file, 'r') as f: # create checkpoint + ckpt = {'epoch': epoch, + 'best_fitness': best_fitness, + 'training_results': f.read(), + 'model': ema.ema, + 'optimizer': None if final_epoch else optimizer.state_dict(), + 'wandb_id': wandb_run.id if self.wandb else None} + + # Save last, best and delete + torch.save(ckpt, last) + if best_fitness == fi: + torch.save(ckpt, best) + del ckpt + # end epoch ---------------------------------------------------------------------------------------------------- + # end training + + if rank in [-1, 0]: + # Strip optimizers + for f in [last, best]: + if f.exists(): # is *.pt + strip_optimizer(f) # strip optimizer + os.system( + 'gsutil cp %s gs://%s/weights' % (f, self.opt.bucket)) if self.opt.bucket else None # upload + + # Plots + if plots: + plot_results(save_dir=save_dir) # save as results.png + if self.wandb: + files = ['results.png', 'precision_recall_curve.png', 'confusion_matrix.png'] + self.wandb.log({"Results": [self.wandb.Image(str(save_dir / f), caption=f) for f in files + if (save_dir / f).exists()]}) + 
logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) + + # Test best.pt + # if self.opt.data.endswith('coco.yaml') and nc == 80: # if COCO + # results, _, _ = test.test(self.opt.data, + # batch_size=total_batch_size, + # imgsz=imgsz_test, + # model=attempt_load(best if best.exists() else last, self.device).half(), + # single_cls=self.opt.single_cls, + # dataloader=testloader, + # save_dir=save_dir, + # save_json=True, # use pycocotools + # plots=False) + + else: + dist.destroy_process_group() + + self.wandb.run.finish() if self.wandb and self.wandb.run else None + torch.cuda.empty_cache() + return model.cpu().state_dict(), mloss #, results + + # def train(self, w_global): + # self.model.train() + # self.model.load_state_dict(w_global) + # self.model.to(self.device) + # + # # train and update + # if self.args.client_optimizer == "sgd": + # optimizer = torch.optim.SGD(self.model.parameters(), lr=self.args.lr) + # else: + # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.args.lr, + # weight_decay=self.args.wd, amsgrad=True) + # + # epoch_loss = [] + # for epoch in range(self.args.epochs): + # batch_loss = [] + # for batch_idx, (x, labels) in enumerate(self.local_training_data): + # x, labels = x.to(self.device), labels.to(self.device) + # # logging.info("x.size = " + str(x.size())) + # # logging.info("labels.size = " + str(labels.size())) + # self.model.zero_grad() + # log_probs = self.model(x) + # loss = self.criterion(log_probs, labels) + # loss.backward() + # + # # to avoid nan loss + # # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5) + # + # optimizer.step() + # # logging.info('Update Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + # # epoch, (batch_idx + 1) * self.args.batch_size, len(self.local_training_data) * self.args.batch_size, + # # 100. 
    def local_test(self, model_global, b_use_test_dataset=False):
        """Evaluate ``model_global`` on this client's local data.

        Args:
            model_global: the (shared) global model to evaluate; it is moved to
                ``self.device`` and switched to eval mode as a side effect.
            b_use_test_dataset: if True evaluate on ``self.local_test_data``,
                otherwise on ``self.local_training_data``.

        Returns:
            dict with keys ``test_correct``, ``test_loss``, ``test_precision``,
            ``test_recall``, ``test_total``. Precision/recall are only
            accumulated for the multi-label ``stackoverflow_lr`` dataset and
            stay 0 otherwise. ``test_loss`` is the sum of per-sample losses
            (loss.item() scaled by batch size), not an average.
        """
        model_global.eval()
        model_global.to(self.device)
        metrics = {
            'test_correct': 0,
            'test_loss' : 0,
            'test_precision': 0,
            'test_recall': 0,
            'test_total' : 0
        }
        if b_use_test_dataset:
            test_data = self.local_test_data
        else:
            test_data = self.local_training_data
        with torch.no_grad():
            for batch_idx, (x, target) in enumerate(test_data):
                x = x.to(self.device)
                target = target.to(self.device)
                pred = model_global(x)
                loss = self.criterion(pred, target)

                if self.args.dataset == "stackoverflow_lr":
                    # Multi-label case: threshold logits at 0.5, count a sample as
                    # correct only if ALL labels match (eq(...).sum over labels == label count).
                    predicted = (pred > .5).int()
                    correct = predicted.eq(target).sum(axis = -1).eq(target.size(1)).sum()
                    # > .1 turns the {0,1} product into a boolean true-positive mask;
                    # 1e-13 guards against division by zero for empty predictions/targets.
                    true_positive = ((target * predicted) > .1).int().sum(axis = -1)
                    precision = true_positive / (predicted.sum(axis = -1) + 1e-13)
                    recall = true_positive / (target.sum(axis = -1) + 1e-13)
                    metrics['test_precision'] += precision.sum().item()
                    metrics['test_recall'] += recall.sum().item()
                else:
                    # Single-label case: argmax over class dimension.
                    _, predicted = torch.max(pred, 1)
                    correct = predicted.eq(target).sum()

                metrics['test_correct'] += correct.item()
                metrics['test_loss'] += loss.item() * target.size(0)
                if len(target.size()) == 1:  # classification: one label per sample
                    metrics['test_total'] += target.size(0)
                elif len(target.size()) == 2:  # for tasks of next word prediction
                    metrics['test_total'] += target.size(0) * target.size(1)

        return metrics
def test(data,
         weights=None,
         batch_size=32,
         imgsz=640,
         conf_thres=0.001,
         iou_thres=0.6,  # for NMS
         save_json=False,
         single_cls=False,
         augment=False,
         verbose=False,
         model=None,
         dataloader=None,
         save_dir=Path(''),  # for saving images
         save_txt=False,  # for auto-labelling
         save_hybrid=False,  # for hybrid auto-labelling
         save_conf=False,  # save auto-label confidences
         plots=True,
         log_imgs=0):  # number of logged images
    """Run YOLOv5 evaluation (mAP / P / R) over a dataset.

    Args:
        data: path to a dataset *.yaml (or, when called from train.py, the path
            whose yaml is loaded below).
        weights: checkpoint path(s); only used when ``model`` is None.
        model: an already-built model => "training" mode (called from train.py);
            when None the model is loaded from ``weights`` and the module-level
            ``opt`` namespace is consulted for device/paths.
        dataloader: reused when given; otherwise built via create_dataloader.

    Returns:
        ((mp, mr, map50, map, box_loss, obj_loss, cls_loss), maps, t) where
        ``maps`` is the per-class mAP array and ``t`` holds timing info.

    NOTE(review): this copy carries local CPU-forcing edits (``device = 'cpu'``,
    ``device = torch.device('cpu')``, ``inf_out = inf_out.cpu()``) — presumably
    to evaluate on CPU during federated simulation; confirm intent before reuse.
    """
    # Initialize/load model and set device
    training = model is not None
    if training:  # called by train.py
        device = next(model.parameters()).device  # get model device

    else:  # called directly
        set_logging()
        device = select_device(opt.device, batch_size=batch_size)

        device = 'cpu'  # NOTE(review): overrides select_device — forces CPU load below
        # Directories
        save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
        (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

        # Load model
        model = attempt_load(weights, map_location=device)  # load FP32 model
        imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size

        # Multi-GPU disabled, incompatible with .half() https://github.com/ultralytics/yolov5/issues/99
        # if device.type != 'cpu' and torch.cuda.device_count() > 1:
        #     model = nn.DataParallel(model)

    device = torch.device('cpu')  # NOTE(review): forces CPU in BOTH paths, so half below is always False
    # Half
    half = device.type != 'cpu'  # half precision only supported on CUDA
    if half:
        model.half()

    # Configure
    model = model.to(device)
    model.eval()
    is_coco = data.endswith('coco.yaml')  # is COCO dataset
    with open(data) as f:
        data = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    check_dataset(data)  # check
    nc = 1 if single_cls else int(data['nc'])  # number of classes
    iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
    niou = iouv.numel()

    # Logging
    log_imgs, wandb = min(log_imgs, 100), None  # ceil
    try:
        import wandb  # Weights & Biases
    except ImportError:
        log_imgs = 0

    # Dataloader
    if not training:
        img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
        _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once
        path = data['test'] if opt.task == 'test' else data['val']  # path to val/test images
        dataloader = create_dataloader(path, imgsz, batch_size, model.stride.max(), opt, pad=0.5, rect=True)[0]

    seen = 0
    confusion_matrix = ConfusionMatrix(nc=nc)
    names = {k: v for k, v in enumerate(model.names if hasattr(model, 'names') else model.module.names)}
    coco91class = coco80_to_coco91_class()
    s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
    p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
    loss = torch.zeros(3, device=device)
    jdict, stats, ap, ap_class, wandb_images = [], [], [], [], []
    for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
        img = img.to(device, non_blocking=True)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        targets = targets.to(device)
        nb, _, height, width = img.shape  # batch size, channels, height, width

        with torch.no_grad():
            # Run model
            t = time_synchronized()
            inf_out, train_out = model(img, augment=augment)  # inference and training outputs
            t0 += time_synchronized() - t

            # Compute loss
            if training:
                loss += compute_loss([x.float() for x in train_out], targets, model)[1][:3]  # box, obj, cls

            # Run NMS
            targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device)  # to pixels
            lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else []  # for autolabelling
            t = time_synchronized()
            inf_out = inf_out.cpu()  # NOTE(review): local edit — NMS runs on CPU tensors here
            output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres, labels=lb)
            t1 += time_synchronized() - t

        # Statistics per image
        targets = targets.cpu()
        for si, pred in enumerate(output):
            labels = targets[targets[:, 0] == si, 1:]
            nl = len(labels)
            tcls = labels[:, 0].tolist() if nl else []  # target class
            path = Path(paths[si])
            seen += 1

            if len(pred) == 0:
                # No detections: still record targets so recall is penalized.
                if nl:
                    stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls))
                continue

            # Predictions
            predn = pred.clone()
            scale_coords(img[si].shape[1:], predn[:, :4], shapes[si][0], shapes[si][1])  # native-space pred

            # Append to text file
            if save_txt:
                gn = torch.tensor(shapes[si][0])[[1, 0, 1, 0]]  # normalization gain whwh
                for *xyxy, conf, cls in predn.tolist():
                    xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                    line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                    with open(save_dir / 'labels' / (path.stem + '.txt'), 'a') as f:
                        f.write(('%g ' * len(line)).rstrip() % line + '\n')

            # W&B logging
            if plots and len(wandb_images) < log_imgs:
                box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
                             "class_id": int(cls),
                             "box_caption": "%s %.3f" % (names[cls], conf),
                             "scores": {"class_score": conf},
                             "domain": "pixel"} for *xyxy, conf, cls in pred.tolist()]
                boxes = {"predictions": {"box_data": box_data, "class_labels": names}}  # inference-space
                wandb_images.append(wandb.Image(img[si], boxes=boxes, caption=path.name))

            # Append to pycocotools JSON dictionary
            if save_json:
                # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ...
                image_id = int(path.stem) if path.stem.isnumeric() else path.stem
                box = xyxy2xywh(predn[:, :4])  # xywh
                box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner
                for p, b in zip(pred.tolist(), box.tolist()):
                    jdict.append({'image_id': image_id,
                                  'category_id': coco91class[int(p[5])] if is_coco else int(p[5]),
                                  'bbox': [round(x, 3) for x in b],
                                  'score': round(p[4], 5)})

            # Assign all predictions as incorrect
            correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device)
            if nl:
                detected = []  # target indices
                tcls_tensor = labels[:, 0]

                # target boxes
                tbox = xywh2xyxy(labels[:, 1:5])
                scale_coords(img[si].shape[1:], tbox, shapes[si][0], shapes[si][1])  # native-space labels
                if plots:
                    confusion_matrix.process_batch(pred, torch.cat((labels[:, 0:1], tbox), 1))

                # Per target class
                for cls in torch.unique(tcls_tensor):
                    ti = (cls == tcls_tensor).nonzero(as_tuple=False).view(-1)  # prediction indices
                    pi = (cls == pred[:, 5]).nonzero(as_tuple=False).view(-1)  # target indices

                    # Search for detections
                    if pi.shape[0]:
                        # Prediction to target ious
                        ious, i = box_iou(predn[pi, :4], tbox[ti]).max(1)  # best ious, indices

                        # Append detections: greedily match each target at most once.
                        detected_set = set()
                        for j in (ious > iouv[0]).nonzero(as_tuple=False):
                            d = ti[i[j]]  # detected target
                            if d.item() not in detected_set:
                                detected_set.add(d.item())
                                detected.append(d)
                                correct[pi[j]] = ious[j] > iouv  # iou_thres is 1xn
                                if len(detected) == nl:  # all targets already located in image
                                    break

            # Append statistics (correct, conf, pcls, tcls)
            stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls))

        # Plot images
        if plots and batch_i < 3:
            f = save_dir / f'test_batch{batch_i}_labels.jpg'  # labels
            Thread(target=plot_images, args=(img, targets, paths, f, names), daemon=True).start()
            f = save_dir / f'test_batch{batch_i}_pred.jpg'  # predictions
            Thread(target=plot_images, args=(img, output_to_target(output), paths, f, names), daemon=True).start()

    # Compute statistics
    stats = [np.concatenate(x, 0) for x in zip(*stats)]  # to numpy
    if len(stats) and stats[0].any():
        p, r, ap, f1, ap_class = ap_per_class(*stats, plot=plots, save_dir=save_dir, names=names)
        p, r, ap50, ap = p[:, 0], r[:, 0], ap[:, 0], ap.mean(1)  # [P, R, AP@0.5, AP@0.5:0.95]
        mp, mr, map50, map = p.mean(), r.mean(), ap50.mean(), ap.mean()
        nt = np.bincount(stats[3].astype(np.int64), minlength=nc)  # number of targets per class
    else:
        nt = torch.zeros(1)

    # Print results
    pf = '%20s' + '%12.3g' * 6  # print format
    print(pf % ('all', seen, nt.sum(), mp, mr, map50, map))

    # Print results per class
    if verbose and nc > 1 and len(stats):
        for i, c in enumerate(ap_class):
            print(pf % (names[c], seen, nt[c], p[i], r[i], ap50[i], ap[i]))

    # Print speeds
    t = tuple(x / seen * 1E3 for x in (t0, t1, t0 + t1)) + (imgsz, imgsz, batch_size)  # tuple
    if not training:
        print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t)

    # Plots
    if plots:
        confusion_matrix.plot(save_dir=save_dir, names=list(names.values()))
        if wandb and wandb.run:
            wandb.log({"Images": wandb_images})
            wandb.log({"Validation": [wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('test*.jpg'))]})

    # Save JSON
    if save_json and len(jdict):
        w = Path(weights[0] if isinstance(weights, list) else weights).stem if weights is not None else ''  # weights
        anno_json = '../coco/annotations/instances_val2017.json'  # annotations json
        pred_json = str(save_dir / f"{w}_predictions.json")  # predictions json
        print('\nEvaluating pycocotools mAP... saving %s...' % pred_json)
        with open(pred_json, 'w') as f:
            json.dump(jdict, f)

        try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
            from pycocotools.coco import COCO
            from pycocotools.cocoeval import COCOeval

            anno = COCO(anno_json)  # init annotations api
            pred = anno.loadRes(pred_json)  # init predictions api
            eval = COCOeval(anno, pred, 'bbox')
            if is_coco:
                eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files]  # image IDs to evaluate
            eval.evaluate()
            eval.accumulate()
            eval.summarize()
            map, map50 = eval.stats[:2]  # update results (mAP@0.5:0.95, mAP@0.5)
        except Exception as e:
            print(f'pycocotools unable to run: {e}')

    # Return results
    if not training:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        print(f"Results saved to {save_dir}{s}")
    model.float()  # for training
    model.cuda()  # NOTE(review): unconditional — will raise on CPU-only hosts even though device was forced to cpu above; confirm
    maps = np.zeros(nc) + map
    for i, c in enumerate(ap_class):
        maps[c] = ap[i]
    return (mp, mr, map50, map, *(loss.cpu() / len(dataloader)).tolist()), maps, t
size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--task', default='val', help="'val', 'test', 'study'") + parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--single-cls', action='store_true', help='treat as single-class dataset') + parser.add_argument('--augment', action='store_true', help='augmented inference') + parser.add_argument('--verbose', action='store_true', help='report mAP by class') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--save-hybrid', action='store_true', help='save label+prediction hybrid results to *.txt') + parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels') + parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') + parser.add_argument('--project', default='runs/test', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') + opt = parser.parse_args() + opt.save_json |= opt.data.endswith('coco.yaml') + opt.data = check_file(opt.data) # check file + print(opt) + opt.device = 'cpu' + if opt.task in ['val', 'test']: # run normally + test(opt.data, + opt.weights, + opt.batch_size, + opt.img_size, + opt.conf_thres, + opt.iou_thres, + opt.save_json, + opt.single_cls, + opt.augment, + opt.verbose, + save_txt=opt.save_txt | opt.save_hybrid, + save_hybrid=opt.save_hybrid, + save_conf=opt.save_conf, + ) + + elif opt.task == 'study': # run over a range of settings and save/plot + for weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']: + f = 'study_%s_%s.txt' % 
(Path(opt.data).stem, Path(weights).stem) # filename to save to + x = list(range(320, 800, 64)) # x axis + y = [] # y axis + for i in x: # img-size + print('\nRunning %s point %s...' % (f, i)) + r, _, t = test(opt.data, weights, opt.batch_size, i, opt.conf_thres, opt.iou_thres, opt.save_json, + plots=False) + y.append(r + t) # results and times + np.savetxt(f, y, fmt='%10.4g') # save + os.system('zip -r study.zip study_*.txt') + plot_study_txt(f, x) # plot diff --git a/experiments/standalone/yolov5/train.py b/experiments/standalone/yolov5/train.py new file mode 100644 index 0000000..27e642d --- /dev/null +++ b/experiments/standalone/yolov5/train.py @@ -0,0 +1,656 @@ +import argparse +import logging +import os +import random +import copy +import time +from pathlib import Path +from threading import Thread +from warnings import warn +import collections +import sys +sys.path.append('fedml/FedML-master') + + +import math +import numpy as np +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler +import torch.utils.data +import yaml +# from apex import amp +from torch.cuda import amp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +import test # import test.py to get mAP after each epoch +# from fedml_api.model.object_detection.yolov5.models.experimental import +from fedml_api.model.object_detection.yolov5.models.experimental import attempt_load +from fedml_api.model.object_detection.yolov5.models.yolo import Model +from models.yolo import Model +from fedml_api.model.object_detection.yolov5.utils.autoanchor import check_anchors +# from utils.datasets import create_dataloader +from fedml_api.model.object_detection.yolov5.utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \ + fitness, strip_optimizer, get_latest_run, 
def aggregate(w_locals):
    """FedAvg aggregation: sample-count-weighted average of client weights.

    Args:
        w_locals: list of ``(sample_num, model_params)`` tuples, one per
            client, where ``model_params`` is a state-dict-like mapping from
            parameter name to tensor (or any value supporting ``*`` and ``+``).

    Returns:
        A NEW dict mapping each parameter name to the weighted average
        ``sum_i (n_i / N) * params_i[k]`` with ``N = sum_i n_i``.

    Note:
        Unlike a naive in-place implementation, this does not alias or mutate
        ``w_locals[0]``'s parameter dict — clients' local state is untouched.
    """
    # Total number of training samples across the selected clients.
    training_num = sum(sample_num for sample_num, _ in w_locals)

    # Build the average into a fresh dict so callers' state dicts are not mutated.
    averaged_params = {}
    for k in w_locals[0][1].keys():
        for i, (local_sample_number, local_model_params) in enumerate(w_locals):
            w = local_sample_number / training_num
            if i == 0:
                averaged_params[k] = local_model_params[k] * w
            else:
                # Re-binding (not +=) avoids in-place ops on client tensors.
                averaged_params[k] = averaged_params[k] + local_model_params[k] * w
    return averaged_params
np.random.choice(range(client_num_in_total), num_clients, replace=False) + logging.info("client_indexes = %s" % str(client_indexes)) + return client_indexes + +def train(hyp, opt, device, tb_writer=None, wandb=None): + logger.info(f'Hyperparameters {hyp}') + save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + Path(opt.save_dir), opt.comm_round, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank + + # Directories + wdir = save_dir / 'weights' + wdir.mkdir(parents=True, exist_ok=True) # make dir + last = wdir / 'last.pt' + best = wdir / 'best.pt' + results_file = save_dir / 'results.txt' + + # Save run settings + with open(save_dir / 'hyp.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + with open(save_dir / 'opt.yaml', 'w') as f: + yaml.dump(vars(opt), f, sort_keys=False) + + # Configure + plots = not opt.evolve # create plots + cuda = device.type != 'cpu' + init_seeds(2 + rank) + with open(opt.data) as f: + data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict + with torch_distributed_zero_first(rank): + check_dataset(data_dict) # check + train_path = data_dict['train'] + test_path = data_dict['val'] + nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check + + # Model + pretrained = weights.endswith('.pt') + if pretrained: + with torch_distributed_zero_first(rank): + attempt_download(weights) # download if not found locally + ckpt = torch.load(weights, map_location=device) # load checkpoint + if hyp.get('anchors'): + ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor + model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create + # ckpt = collections.defaultdict() + + exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys + state_dict = ckpt['model'].float().state_dict() # to FP32 + state_dict = 
intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect + model.load_state_dict(state_dict, strict=False) # load + logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + else: + model = Model(opt.cfg, ch=3, nc=nc).to(device) # create + # model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(device) + # Freeze + freeze = [] # parameter names to freeze (full or partial) + for k, v in model.named_parameters(): + v.requires_grad = True # train all layers + if any(x in k for x in freeze): + print('freezing %s' % k) + v.requires_grad = False + + # Optimizer + nbs = 64 # nominal batch size + accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing + hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + for k, v in model.named_modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d): + pg0.append(v.weight) # no decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + if opt.adam: + optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum + else: + optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) + + optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay + optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) + del pg0, pg1, pg2 + + # Scheduler https://arxiv.org/pdf/1812.01187.pdf + # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR + lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf'] # cosine + scheduler = 
lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) + # plot_lr_scheduler(optimizer, scheduler, epochs) + + # Logging + if wandb and wandb.run is None: + opt.hyp = hyp # add hyperparameters + wandb_run = wandb.init(config=opt, resume="allow", + project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, + name=save_dir.stem, + id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) + loggers = {'wandb': wandb} # loggers dict + + # Resume + start_epoch, best_fitness = 0, 0.0 + if pretrained: + # Optimizer + if ckpt['optimizer'] is not None: + optimizer.load_state_dict(ckpt['optimizer']) + best_fitness = ckpt['best_fitness'] + + # Results + if ckpt.get('training_results') is not None: + with open(results_file, 'w') as file: + file.write(ckpt['training_results']) # write results.txt + + # Epochs + start_epoch = ckpt['epoch'] + 1 + print("start_epoch:", start_epoch) + # start_epoch = 1 #250 + if opt.resume: + assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) + if epochs < start_epoch: + logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% + (weights, ckpt['epoch'], epochs)) + epochs += ckpt['epoch'] # finetune additional epochs + + del ckpt, state_dict + # start_epoch = 0 + # Image sizes + gs = int(max(model.stride)) # grid size (max stride) + imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples + + # DP mode + if cuda and rank == -1 and torch.cuda.device_count() > 1: + model = torch.nn.DataParallel(model) + + # SyncBatchNorm + if opt.sync_bn and cuda and rank != -1: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) + logger.info('Using SyncBatchNorm()') + + # EMA + ema = ModelEMA(model) if rank in [-1, 0] else None + + # DDP mode + if cuda and rank != -1: + model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) + + # Trainloader + dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, + hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, + world_size=opt.world_size, workers=opt.workers, + image_weights=opt.image_weights) + mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class + nb = len(dataloader) # number of batches + assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) + + # Process 0 + if rank in [-1, 0]: + ema.updates = start_epoch * nb // accumulate # set EMA updates + testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, # testloader + hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, + rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5)[0] + + if not opt.resume: + labels = np.concatenate(dataset.labels, 0) + c = torch.tensor(labels[:, 0]) # classes + # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# frequency + # model._initialize_biases(cf.to(device)) + if plots: + Thread(target=plot_labels, args=(labels, save_dir, loggers), daemon=True).start() + if tb_writer: + tb_writer.add_histogram('classes', c, 0) + + # Anchors + if not opt.noautoanchor: + check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) + + # Model parameters + hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset + model.nc = nc # attach number of classes to model + model.hyp = hyp # attach hyperparameters to model + model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) + model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights + model.names = names + + # Start training + t0 = time.time() + nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) + # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training + maps = np.zeros(nc) # mAP per class + results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + scheduler.last_epoch = start_epoch - 1 # do not move + scaler = amp.GradScaler(enabled=cuda) + logger.info('Image sizes %g train, %g test\n' + 'Using %g dataloader workers\nLogging results to %s\n' + 'Starting training for %g epochs...' 
% (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs)) + + # client + client_list = [] + client_number = opt.client_number + partition = opt.partition + net_dataidx_map = partition_data(train_path, partition=partition, n_nets=client_number) + train_data_loader_dict = dict() + train_data_num_dict = dict() + train_dataset_dict = dict() + for i in range(client_number): + print("net_dataidx_map trainer:", net_dataidx_map[i]) + dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, + hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, + rank=rank, + world_size=opt.world_size, workers=opt.workers, + image_weights=opt.image_weights, + net_dataidx_map=net_dataidx_map[i]) + train_dataset_dict[i] = dataset + train_data_num_dict[i] = len(dataset) + train_data_loader_dict[i] = dataloader + client_list.append( + Client(i, train_data_loader_dict[i], len(dataset), opt, device, model, tb_writer=tb_writer, + hyp=hyp, wandb=wandb)) + # fedml + w_global = model.state_dict() + print("comm_round:", opt.comm_round) + for round_idx in range(start_epoch, opt.comm_round): + logging.info("################Communication round : {}".format(round_idx)) + w_locals, loss_locals = [], [] + + client_indexes = client_sampling(round_idx, opt.client_number, opt.client_num_per_round) + logging.info("client_indexes = " + str(client_indexes)) + + for idx, client in enumerate(client_list): + client_idx = client_indexes[idx] + client.update_local_dataset(client_idx, train_data_loader_dict[client_idx], train_data_num_dict[client_idx]) + + client_model = client.model + client_model.to(device) + client_model.train() + + client_dataset = train_dataset_dict[client_idx] + client_dataloader = train_data_loader_dict[client_idx] + nb = len(client_dataloader) + + if opt.image_weights: + # Generate indices + if rank in [-1, 0]: + cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights + iw = labels_to_image_weights(client_dataset.labels, nc=nc, 
class_weights=cw) # image weights + client_dataset.indices = random.choices(range(client_dataset.n), weights=iw, k=client_dataset.n) # rand weighted idx + # Broadcast if DDP + if rank != -1: + indices = (torch.tensor(client_dataset.indices) if rank == 0 else torch.zeros(client_dataset.n)).int() + dist.broadcast(indices, 0) + if rank != 0: + client_dataset.indices = indices.cpu().numpy() + + + mloss = torch.zeros(4, device=device) # mean losses + if rank != -1: + client_dataloader.sampler.set_epoch(round_idx) + pbar = enumerate(client_dataloader) + logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) + if rank in [-1, 0]: + pbar = tqdm(pbar, total=nb) # progress bar + optimizer.zero_grad() + for i, ( + imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- + ni = i + nb * round_idx # number integrated batches (since train start) + imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 + + # Warmup + if ni <= nw: + xi = [0, nw] # x interp + # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) + accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) + for j, x in enumerate(optimizer.param_groups): + # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 + x['lr'] = np.interp(ni, xi, + [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(round_idx)]) + if 'momentum' in x: + x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) + + # Multi-scale + if opt.multi_scale: + sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size + sf = sz / max(imgs.shape[2:]) # scale factor + if sf != 1: + ns = [math.ceil(x * sf / gs) * gs for x in + imgs.shape[2:]] # new shape (stretched to gs-multiple) + imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) + + # Forward + # print("imgs:", imgs.dtype) + with 
amp.autocast(enabled=cuda): + pred = client_model(imgs) # forward + loss, loss_items = compute_loss(pred, targets.to(device), client_model) # loss scaled by batch_size + if rank != -1: + loss *= opt.world_size # gradient averaged between devices in DDP mode + + # Backward + scaler.scale(loss).backward() + + # Optimize + if ni % accumulate == 0: + scaler.step(optimizer) # optimizer.step + scaler.update() + optimizer.zero_grad() + if ema: + ema.update(model) + + # Print + if rank in [-1, 0]: + mloss = (mloss * i + loss_items) / (i + 1) # update mean losses + mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) + s = ('%10s' * 2 + '%10.4g' * 6) % ( + '%g/%g' % (round_idx, opt.comm_round - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) + pbar.set_description(s) + + # Plot + if plots and ni < 3: + f = save_dir / f'train_batch{ni}.jpg' # filename + Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() + # if tb_writer: + # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) + # tb_writer.add_graph(model, imgs) # add model to tensorboard + elif plots and ni == 3 and wandb: + wandb.log( + {"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')]}) + + w_locals.append((client.get_sample_number(), copy.deepcopy(client_model.cpu().state_dict()))) + + w_global = aggregate(w_locals) + + + + lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard + scheduler.step() + + if round_idx % opt.frequency_of_the_test == 0 or round_idx == opt.comm_round - 1: + model.load_state_dict(w_global) + + else: + continue + # DDP process 0 or single-GPU + if rank in [-1, 0]: + # mAP + if ema: + ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) + final_epoch = round_idx + 1 == opt.comm_round + if not opt.notest or final_epoch: # Calculate mAP + results, maps, times = test.test(opt.data, + batch_size=total_batch_size, + imgsz=imgsz_test, + 
model=ema.ema, + single_cls=opt.single_cls, + dataloader=testloader, + save_dir=save_dir, + plots=plots and final_epoch, + log_imgs=opt.log_imgs if wandb else 0) + + # Write + with open(results_file, 'a') as f: + f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + if len(opt.name) and opt.bucket: + os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) + + # Log + tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss + 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', + 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss + 'x/lr0', 'x/lr1', 'x/lr2'] # params + for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): + if tb_writer: + tb_writer.add_scalar(tag, x, round_idx) # tensorboard + if wandb: + wandb.log({tag: x}) # W&B + + # Update best mAP + fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95] + if fi > best_fitness: + best_fitness = fi + + # Save model + save = (not opt.nosave) or (final_epoch and not opt.evolve) + if save: + with open(results_file, 'r') as f: # create checkpoint + ckpt = {'epoch': round_idx, + 'best_fitness': best_fitness, + 'training_results': f.read(), + 'model': ema.ema, + 'optimizer': None if final_epoch else optimizer.state_dict(), + 'wandb_id': wandb_run.id if wandb else None} + + # Save last, best and delete + torch.save(ckpt, last) + if best_fitness == fi: + torch.save(ckpt, best) + del ckpt + + + + + wandb.run.finish() if wandb and wandb.run else None + torch.cuda.empty_cache() + return results + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path') + parser.add_argument('--cfg', type=str, default='', help='model.yaml path') + parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path') + 
parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') + parser.add_argument('--epochs', type=int, default=400) + parser.add_argument('--batch-size', type=int, default=1, help='total batch size for all GPUs') + parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') + parser.add_argument('--rect', action='store_true', help='rectangular training') + parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') + parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') + parser.add_argument('--notest', action='store_true', help='only test final epoch') + parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') + parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') + parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') + parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') + parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') + parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') + parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') + parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') + parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') + parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') + parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') + parser.add_argument('--log-imgs', type=int, default=4, help='number of images for W&B logging, max 100') + parser.add_argument('--workers', type=int, default=1, help='maximum number of dataloader workers') + parser.add_argument('--project', default='runs/train', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') + + # fedml + parser.add_argument('--partition', type=str, default='homo', help='hyperparameters path') + parser.add_argument('--client_number', type=int, default=8, help='maximum number of dataloader workers') + parser.add_argument('--client_num_per_round', type=int, default=8, help='maximum number of dataloader workers') + parser.add_argument('--comm_round', type=int, default=400, help='maximum number of dataloader workers') + parser.add_argument('--frequency_of_the_test', type=int, default=10) + + opt = parser.parse_args() + + # Set DDP variables + opt.total_batch_size = opt.batch_size + opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 + opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 + set_logging(opt.global_rank) + if opt.global_rank in [-1, 0]: + check_git_status() + + # Resume + if opt.resume: # resume an interrupted run + ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path + 
assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' + with open(Path(ckpt).parent.parent / 'opt.yaml') as f: + opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace + opt.cfg, opt.weights, opt.resume = '', ckpt, True + logger.info('Resuming training from %s' % ckpt) + else: + # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') + opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files + assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' + opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) + opt.name = 'evolve' if opt.evolve else opt.name + opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) # increment run + + # DDP mode + device = select_device(opt.device, batch_size=opt.batch_size) + if opt.local_rank != -1: + assert torch.cuda.device_count() > opt.local_rank + torch.cuda.set_device(opt.local_rank) + device = torch.device('cuda', opt.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') # distributed backend + assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' + opt.batch_size = opt.total_batch_size // opt.world_size + + # Hyperparameters + with open(opt.hyp) as f: + hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps + if 'box' not in hyp: + warn('Compatibility: %s missing "box" which was renamed from "giou" in %s' % + (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120')) + hyp['box'] = hyp.pop('giou') + + # Train + logger.info(opt) + if not opt.evolve: + tb_writer = None # init loggers + if opt.global_rank in [-1, 0]: + logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/') + tb_writer = SummaryWriter(opt.save_dir) # Tensorboard + train(hyp, opt, device, tb_writer, wandb) + + # 
Evolve hyperparameters (optional) + else: + # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) + meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) + 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) + 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 + 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay + 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) + 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum + 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr + 'box': (1, 0.02, 0.2), # box loss gain + 'cls': (1, 0.2, 4.0), # cls loss gain + 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight + 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) + 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight + 'iou_t': (0, 0.1, 0.7), # IoU training threshold + 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold + 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) + 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) + 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) + 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) + 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) + 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) + 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) + 'scale': (1, 0.0, 0.9), # image scale (+/- gain) + 'shear': (1, 0.0, 10.0), # image shear (+/- deg) + 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 + 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) + 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) + 'mosaic': (1, 0.0, 1.0), # image mixup (probability) + 'mixup': (1, 0.0, 1.0)} # image mixup (probability) + + assert opt.local_rank == -1, 'DDP mode not implemented for --evolve' + opt.notest, opt.nosave = True, True # only test/save final epoch + # ei = 
[isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices + yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here + if opt.bucket: + os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists + + for _ in range(300): # generations to evolve + if Path('evolve.txt').exists(): # if evolve.txt exists: select best hyps and mutate + # Select parent(s) + parent = 'single' # parent selection method: 'single' or 'weighted' + x = np.loadtxt('evolve.txt', ndmin=2) + n = min(5, len(x)) # number of previous results to consider + x = x[np.argsort(-fitness(x))][:n] # top n mutations + w = fitness(x) - fitness(x).min() # weights + if parent == 'single' or len(x) == 1: + # x = x[random.randint(0, n - 1)] # random selection + x = x[random.choices(range(n), weights=w)[0]] # weighted selection + elif parent == 'weighted': + x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination + + # Mutate + mp, s = 0.8, 0.2 # mutation probability, sigma + npr = np.random + npr.seed(int(time.time())) + g = np.array([x[0] for x in meta.values()]) # gains 0-1 + ng = len(meta) + v = np.ones(ng) + while all(v == 1): # mutate until a change occurs (prevent duplicates) + v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) + for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) + hyp[k] = float(x[i + 7] * v[i]) # mutate + + # Constrain to limits + for k, v in meta.items(): + hyp[k] = max(hyp[k], v[1]) # lower limit + hyp[k] = min(hyp[k], v[2]) # upper limit + hyp[k] = round(hyp[k], 5) # significant digits + + # Train mutation + results = train(hyp.copy(), opt, device, wandb=wandb) + + # Write mutation results + print_mutation(hyp.copy(), results, yaml_file, opt.bucket) + + # Plot results + plot_evolution(yaml_file) + print(f'Hyperparameter evolution complete. 
Best results saved as: {yaml_file}\n' + f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}') diff --git a/model/classification/timm_models.md b/model/classification/timm_models.md new file mode 100644 index 0000000..7430b09 --- /dev/null +++ b/model/classification/timm_models.md @@ -0,0 +1,20 @@ +# EfficientNet-B0 with RandAugment - 77.7 top-1, 95.3 top-5 +Michael Klachko achieved these results with the command line for B2 adapted for larger batch size, with the recommended B0 dropout rate of 0.2. + +``` +./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048 +``` + +# MobileNetV3-Large-100 - 75.766 top-1, 92,542 top-5 +``` +./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9 +``` + + + + + + + + + diff --git a/model/detection/yolov5/utils/activations.py b/model/detection/yolov5/utils/activations.py new file mode 100644 index 0000000..24f5a30 --- /dev/null +++ b/model/detection/yolov5/utils/activations.py @@ -0,0 +1,72 @@ +# Activation functions + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# Swish https://arxiv.org/pdf/1905.02244.pdf --------------------------------------------------------------------------- +class Swish(nn.Module): # + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +class Hardswish(nn.Module): # export-friendly version of nn.Hardswish() + @staticmethod + def forward(x): + # 
return x * F.hardsigmoid(x) # for torchscript and CoreML + return x * F.hardtanh(x + 3, 0., 6.) / 6. # for torchscript, CoreML and ONNX + + +class MemoryEfficientSwish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x * torch.sigmoid(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + return grad_output * (sx * (1 + x * (1 - sx))) + + def forward(self, x): + return self.F.apply(x) + + +# Mish https://github.com/digantamisra98/Mish -------------------------------------------------------------------------- +class Mish(nn.Module): + @staticmethod + def forward(x): + return x * F.softplus(x).tanh() + + +class MemoryEfficientMish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + fx = F.softplus(x).tanh() + return grad_output * (fx + x * sx * (1 - fx * fx)) + + def forward(self, x): + return self.F.apply(x) + + +# FReLU https://arxiv.org/abs/2007.11824 ------------------------------------------------------------------------------- +class FReLU(nn.Module): + def __init__(self, c1, k=3): # ch_in, kernel + super().__init__() + self.conv = nn.Conv2d(c1, c1, k, 1, 1, groups=c1, bias=False) + self.bn = nn.BatchNorm2d(c1) + + def forward(self, x): + return torch.max(x, self.bn(self.conv(x))) diff --git a/model/detection/yolov5/utils/datasets.py b/model/detection/yolov5/utils/datasets.py new file mode 100644 index 0000000..313180f --- /dev/null +++ b/model/detection/yolov5/utils/datasets.py @@ -0,0 +1,933 @@ +# Dataset utils and dataloaders + +import glob +import logging +import math +import os +import random +import shutil +import time +from itertools import repeat +from multiprocessing.pool import ThreadPool +from 
pathlib import Path +from threading import Thread + +import cv2 +import numpy as np +import torch +from PIL import Image, ExifTags +from torch.utils.data import Dataset +from tqdm import tqdm + +from .general import xyxy2xywh, xywh2xyxy +from .torch_utils import torch_distributed_zero_first + +# Parameters +help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data' +img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng'] # acceptable image suffixes +vid_formats = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes +logger = logging.getLogger(__name__) + +# Get orientation exif tag +for orientation in ExifTags.TAGS.keys(): + if ExifTags.TAGS[orientation] == 'Orientation': + break + + +def get_hash(files): + # Returns a single hash value of a list of files + return sum(os.path.getsize(f) for f in files if os.path.isfile(f)) + + +def exif_size(img): + # Returns exif-corrected PIL size + s = img.size # (width, height) + try: + rotation = dict(img._getexif().items())[orientation] + if rotation == 6: # rotation 270 + s = (s[1], s[0]) + elif rotation == 8: # rotation 90 + s = (s[1], s[0]) + except: + pass + + return s + + +def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False, + rank=-1, world_size=1, workers=8, image_weights=False): + # Make sure only the first process in DDP process the dataset first, and the following others can use the cache + with torch_distributed_zero_first(rank): + dataset = LoadImagesAndLabels(path, imgsz, batch_size, + augment=augment, # augment images + hyp=hyp, # augmentation hyperparameters + rect=rect, # rectangular training + cache_images=cache, + single_cls=opt.single_cls, + stride=int(stride), + pad=pad, + rank=rank, + image_weights=image_weights) + + batch_size = min(batch_size, len(dataset)) + nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, workers]) # number of workers + sampler = 
torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None + loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader + # Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader() + dataloader = loader(dataset, + batch_size=batch_size, + num_workers=nw, + sampler=sampler, + pin_memory=True, + collate_fn=LoadImagesAndLabels.collate_fn) + return dataloader, dataset + + +class InfiniteDataLoader(torch.utils.data.dataloader.DataLoader): + """ Dataloader that reuses workers + + Uses same syntax as vanilla DataLoader + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler)) + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever + + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) + + +class LoadImages: # for inference + def __init__(self, path, img_size=640): + p = str(Path(path)) # os-agnostic + p = os.path.abspath(p) # absolute path + if '*' in p: + files = sorted(glob.glob(p, recursive=True)) # glob + elif os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '*.*'))) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise Exception('ERROR: %s does not exist' % p) + + images = [x for x in files if x.split('.')[-1].lower() in img_formats] + videos = [x for x in files if x.split('.')[-1].lower() in vid_formats] + ni, nv = len(images), len(videos) + + self.img_size = img_size + self.files = images + videos + self.nf = ni + nv # number of files + self.video_flag = [False] * ni + [True] * nv + self.mode = 'image' + if any(videos): + 
self.new_video(videos[0]) # new video + else: + self.cap = None + assert self.nf > 0, 'No images or videos found in %s. Supported formats are:\nimages: %s\nvideos: %s' % \ + (p, img_formats, vid_formats) + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + + if self.video_flag[self.count]: + # Read video + self.mode = 'video' + ret_val, img0 = self.cap.read() + if not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + else: + path = self.files[self.count] + self.new_video(path) + ret_val, img0 = self.cap.read() + + self.frame += 1 + print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nf, self.frame, self.nframes, path), end='') + + else: + # Read image + self.count += 1 + img0 = cv2.imread(path) # BGR + assert img0 is not None, 'Image Not Found ' + path + print('image %g/%g %s: ' % (self.count, self.nf, path), end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return path, img, img0, self.cap + + def new_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files + + +class LoadWebcam: # for inference + def __init__(self, pipe='0', img_size=640): + self.img_size = img_size + + if pipe.isnumeric(): + pipe = eval(pipe) # local camera + # pipe = 'rtsp://192.168.1.64/1' # IP camera + # pipe = 'rtsp://username:password@192.168.1.64/1' # IP camera with login + # pipe = 'http://wmccpinetop.axiscam.net/mjpg/video.mjpg' # IP golf camera + + self.pipe = pipe + self.cap = cv2.VideoCapture(pipe) # video capture object + self.cap.set(cv2.CAP_PROP_BUFFERSIZE, 3) # set buffer size + + def __iter__(self): + self.count = -1 + 
return self + + def __next__(self): + self.count += 1 + if cv2.waitKey(1) == ord('q'): # q to quit + self.cap.release() + cv2.destroyAllWindows() + raise StopIteration + + # Read frame + if self.pipe == 0: # local camera + ret_val, img0 = self.cap.read() + img0 = cv2.flip(img0, 1) # flip left-right + else: # IP camera + n = 0 + while True: + n += 1 + self.cap.grab() + if n % 30 == 0: # skip frames + ret_val, img0 = self.cap.retrieve() + if ret_val: + break + + # Print + assert ret_val, 'Camera Error %s' % self.pipe + img_path = 'webcam.jpg' + print('webcam %g: ' % self.count, end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return img_path, img, img0, None + + def __len__(self): + return 0 + + +class LoadStreams: # multiple IP or RTSP cameras + def __init__(self, sources='streams.txt', img_size=640): + self.mode = 'stream' + self.img_size = img_size + + if os.path.isfile(sources): + with open(sources, 'r') as f: + sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())] + else: + sources = [sources] + + n = len(sources) + self.imgs = [None] * n + self.sources = sources + for i, s in enumerate(sources): + # Start the thread to read frames from the video stream + print('%g/%g: %s... ' % (i + 1, n, s), end='') + cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s) + assert cap.isOpened(), 'Failed to open %s' % s + w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) % 100 + _, self.imgs[i] = cap.read() # guarantee first frame + thread = Thread(target=self.update, args=([i, cap]), daemon=True) + print(' success (%gx%g at %.2f FPS).' 
% (w, h, fps)) + thread.start() + print('') # newline + + # check for common shapes + s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes + self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal + if not self.rect: + print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.') + + def update(self, index, cap): + # Read next stream frame in a daemon thread + n = 0 + while cap.isOpened(): + n += 1 + # _, self.imgs[index] = cap.read() + cap.grab() + if n == 4: # read every 4th frame + _, self.imgs[index] = cap.retrieve() + n = 0 + time.sleep(0.01) # wait time + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + img0 = self.imgs.copy() + if cv2.waitKey(1) == ord('q'): # q to quit + cv2.destroyAllWindows() + raise StopIteration + + # Letterbox + img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0] + + # Stack + img = np.stack(img, 0) + + # Convert + img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416 + img = np.ascontiguousarray(img) + + return self.sources, img, img0, None + + def __len__(self): + return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years + + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep # /images/, /labels/ substrings + return [x.replace(sa, sb, 1).replace('.' 
+ x.split('.')[-1], '.txt') for x in img_paths] + + +class LoadImagesAndLabels(Dataset): # for training/testing + def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False, + cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1): + self.img_size = img_size + self.augment = augment + self.hyp = hyp + self.image_weights = image_weights + self.rect = False if image_weights else rect + self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training) + self.mosaic_border = [-img_size // 2, -img_size // 2] + self.stride = stride + + try: + f = [] # image files + for p in path if isinstance(path, list) else [path]: + p = Path(p) # os-agnostic + if p.is_dir(): # dir + f += glob.glob(str(p / '**' / '*.*'), recursive=True) + elif p.is_file(): # file + with open(p, 'r') as t: + t = t.read().strip().splitlines() + parent = str(p.parent) + os.sep + f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + else: + raise Exception('%s does not exist' % p) + self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats]) + assert self.img_files, 'No images found' + except Exception as e: + raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) + + # Check cache + self.label_files = img2label_paths(self.img_files) # labels + cache_path = Path(self.label_files[0]).parent.with_suffix('.cache') # cached labels + if cache_path.is_file(): + cache = torch.load(cache_path) # load + if cache['hash'] != get_hash(self.label_files + self.img_files) or 'results' not in cache: # changed + cache = self.cache_labels(cache_path) # re-cache + else: + cache = self.cache_labels(cache_path) # cache + + # Display cache + [nf, nm, ne, nc, n] = cache.pop('results') # found, missing, empty, corrupted, total + desc = f"Scanning '{cache_path}' for images and labels... 
{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + tqdm(None, desc=desc, total=n, initial=n) + assert nf > 0 or not augment, f'No labels found in {cache_path}. Can not train without labels. See {help_url}' + + # Read cache + cache.pop('hash') # remove hash + labels, shapes = zip(*cache.values()) + self.labels = list(labels) + self.shapes = np.array(shapes, dtype=np.float64) + self.img_files = list(cache.keys()) # update + self.label_files = img2label_paths(cache.keys()) # update + if single_cls: + for x in self.labels: + x[:, 0] = 0 + + n = len(shapes) # number of images + bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index + nb = bi[-1] + 1 # number of batches + self.batch = bi # batch index of image + self.n = n + self.indices = range(n) + + # Rectangular Training + if self.rect: + # Sort by aspect ratio + s = self.shapes # wh + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_files = [self.img_files[i] for i in irect] + self.label_files = [self.label_files[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * nb + for i in range(nb): + ari = ar[bi == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [maxi, 1] + elif mini > 1: + shapes[i] = [1, 1 / mini] + + self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride + + # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM) + self.imgs = [None] * n + if cache_images: + gb = 0 # Gigabytes of cached images + self.img_hw0, self.img_hw = [None] * n, [None] * n + results = ThreadPool(8).imap(lambda x: load_image(*x), zip(repeat(self), range(n))) # 8 threads + pbar = tqdm(enumerate(results), total=n) + for i, x in pbar: + self.imgs[i], self.img_hw0[i], self.img_hw[i] = x # img, hw_original, hw_resized = load_image(self, i) + gb += self.imgs[i].nbytes + pbar.desc = 
'Caching images (%.1fGB)' % (gb / 1E9) + + def cache_labels(self, path=Path('./labels.cache')): + # Cache dataset labels, check images and read shapes + x = {} # dict + nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, duplicate + pbar = tqdm(zip(self.img_files, self.label_files), desc='Scanning images', total=len(self.img_files)) + for i, (im_file, lb_file) in enumerate(pbar): + try: + # verify images + im = Image.open(im_file) + im.verify() # PIL verify + shape = exif_size(im) # image size + assert (shape[0] > 9) & (shape[1] > 9), 'image size <10 pixels' + + # verify labels + if os.path.isfile(lb_file): + nf += 1 # label found + with open(lb_file, 'r') as f: + l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + if len(l): + assert l.shape[1] == 5, 'labels require 5 columns each' + assert (l >= 0).all(), 'negative labels' + assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels' + else: + ne += 1 # label empty + l = np.zeros((0, 5), dtype=np.float32) + else: + nm += 1 # label missing + l = np.zeros((0, 5), dtype=np.float32) + x[im_file] = [l, shape] + except Exception as e: + nc += 1 + print('WARNING: Ignoring corrupted image and/or label %s: %s' % (im_file, e)) + + pbar.desc = f"Scanning '{path.parent / path.stem}' for images and labels... " \ + f"{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + + if nf == 0: + print(f'WARNING: No labels found in {path}. 
See {help_url}') + + x['hash'] = get_hash(self.label_files + self.img_files) + x['results'] = [nf, nm, ne, nc, i + 1] + torch.save(x, path) # save for next time + logging.info(f"New cache created: {path}") + return x + + def __len__(self): + return len(self.img_files) + + # def __iter__(self): + # self.count = -1 + # print('ran dataset iter') + # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) + # return self + + def __getitem__(self, index): + index = self.indices[index] # linear, shuffled, or image_weights + + hyp = self.hyp + mosaic = self.mosaic and random.random() < hyp['mosaic'] + if mosaic: + # Load mosaic + img, labels = load_mosaic(self, index) + shapes = None + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if random.random() < hyp['mixup']: + img2, labels2 = load_mosaic(self, random.randint(0, self.n - 1)) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + + else: + # Load image + img, (h0, w0), (h, w) = load_image(self, index) + + # Letterbox + shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape + img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling + + # Load labels + labels = [] + x = self.labels[index] + if x.size > 0: + # Normalized xywh to pixel xyxy format + labels = x.copy() + labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0] # pad width + labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1] # pad height + labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0] + labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1] + + if self.augment: + # Augment imagespace + if not mosaic: + img, labels = random_perspective(img, labels, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + 
shear=hyp['shear'], + perspective=hyp['perspective']) + + # Augment colorspace + augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) + + # Apply cutouts + # if random.random() < 0.9: + # labels = cutout(img, labels) + + nL = len(labels) # number of labels + if nL: + labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) # convert xyxy to xywh + labels[:, [2, 4]] /= img.shape[0] # normalized height 0-1 + labels[:, [1, 3]] /= img.shape[1] # normalized width 0-1 + + if self.augment: + # flip up-down + if random.random() < hyp['flipud']: + img = np.flipud(img) + if nL: + labels[:, 2] = 1 - labels[:, 2] + + # flip left-right + if random.random() < hyp['fliplr']: + img = np.fliplr(img) + if nL: + labels[:, 1] = 1 - labels[:, 1] + + labels_out = torch.zeros((nL, 6)) + if nL: + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return torch.from_numpy(img), labels_out, self.img_files[index], shapes + + @staticmethod + def collate_fn(batch): + img, label, path, shapes = zip(*batch) # transposed + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes + + +# Ancillary functions -------------------------------------------------------------------------------------------------- +def load_image(self, index): + # loads 1 image from dataset, returns img, original hw, resized hw + img = self.imgs[index] + if img is None: # not cached + path = self.img_files[index] + img = cv2.imread(path) # BGR + assert img is not None, 'Image Not Found ' + path + h0, w0 = img.shape[:2] # orig hw + r = self.img_size / max(h0, w0) # resize image to img_size + if r != 1: # always resize down, only resize up if training with augmentation + interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR + img = cv2.resize(img, (int(w0 * r), int(h0 * r)), 
interpolation=interp) + return img, (h0, w0), img.shape[:2] # img, hw_original, hw_resized + else: + return self.imgs[index], self.img_hw0[index], self.img_hw[index] # img, hw_original, hw_resized + + +def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5): + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=np.int16) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + + # Histogram equalization + # if random.random() < 0.2: + # for i in range(3): + # img[:, :, i] = cv2.equalizeHist(img[:, :, i]) + + +def load_mosaic(self, index): + # loads images in a mosaic + + labels4 = [] + s = self.img_size + yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border] # mosaic center x, y + indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(3)] # 3 additional image indices + for i, index in enumerate(indices): + # Load image + img, _, (h, w) = load_image(self, index) + + # place img in img4 + if i == 0: # top left + img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) 
+ elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + x = self.labels[index] + labels = x.copy() + if x.size > 0: # Normalized xywh to pixel xyxy format + labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw + labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh + labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw + labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh + labels4.append(labels) + + # Concat/clip labels + if len(labels4): + labels4 = np.concatenate(labels4, 0) + np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:]) # use with random_perspective + # img4, labels4 = replicate(img4, labels4) # replicate + + # Augment + img4, labels4 = random_perspective(img4, labels4, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + perspective=self.hyp['perspective'], + border=self.mosaic_border) # border to remove + + return img4, labels4 + + +def replicate(img, labels): + # Replicate labels + h, w = img.shape[:2] + boxes = labels[:, 1:].astype(int) + x1, y1, x2, y2 = boxes.T + s = ((x2 - x1) + (y2 - y1)) / 2 # side length (pixels) + for i in s.argsort()[:round(s.size * 0.5)]: # smallest indices + x1b, y1b, x2b, y2b = boxes[i] + bh, bw = y2b - y1b, x2b - x1b + yc, xc = int(random.uniform(0, h - bh)), int(random.uniform(0, w - bw)) # offset x, y + x1a, y1a, x2a, y2a = [xc, yc, xc + bw, yc + bh] + img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + labels = np.append(labels, [[labels[i, 0], x1a, y1a, x2a, y2a]], axis=0) + + return img, labels + + +def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True): + # Resize image to a 32-pixel-multiple rectangle 
https://github.com/ultralytics/yolov3/issues/232 + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return img, ratio, (dw, dh) + + +def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # targets = [cls, xyxy] + + height = img.shape[0] + border[0] * 2 # shape(h,w,c) + width = img.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = -img.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3) + P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) + P[2, 1] = random.uniform(-perspective, perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3) + a = 
random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if perspective: + img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + # Visualize + # import matplotlib.pyplot as plt + # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() + # ax[0].imshow(img[:, :, ::-1]) # base + # ax[1].imshow(img2[:, :, ::-1]) # warped + + # Transform label coordinates + n = len(targets) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + if perspective: + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + else: # affine + xy = xy[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # # apply angle-based reduction of bounding boxes + # radians = a * math.pi / 180 + # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + # x = (xy[:, 2] + xy[:, 0]) / 2 + # y = (xy[:, 3] 
+ xy[:, 1]) / 2 + # w = (xy[:, 2] - xy[:, 0]) * reduction + # h = (xy[:, 3] - xy[:, 1]) * reduction + # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + + # filter candidates + i = box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T) + targets = targets[i] + targets[:, 1:5] = xy[i] + + return img, targets + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1): # box1(4,n), box2(4,n) + # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates + + +def cutout(image, labels): + # Applies image cutout augmentation https://arxiv.org/abs/1708.04552 + h, w = image.shape[:2] + + def bbox_ioa(box1, box2): + # Returns the intersection over box2 area given box1, box2. box1 is 4, box2 is nx4. 
boxes are x1y1x2y2 + box2 = box2.transpose() + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + 1e-16 + + # Intersection over box2 area + return inter_area / box2_area + + # create random masks + scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16 # image size fraction + for s in scales: + mask_h = random.randint(1, int(h * s)) + mask_w = random.randint(1, int(w * s)) + + # box + xmin = max(0, random.randint(0, w) - mask_w // 2) + ymin = max(0, random.randint(0, h) - mask_h // 2) + xmax = min(w, xmin + mask_w) + ymax = min(h, ymin + mask_h) + + # apply random color mask + image[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)] + + # return unobscured labels + if len(labels) and s > 0.03: + box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32) + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + labels = labels[ioa < 0.60] # remove >60% obscured labels + + return labels + + +def create_folder(path='./new'): + # Create folder + if os.path.exists(path): + shutil.rmtree(path) # delete output folder + os.makedirs(path) # make new output folder + + +def flatten_recursive(path='../coco128'): + # Flatten a recursive directory by bringing all files to top level + new_path = Path(path + '_flat') + create_folder(new_path) + for file in tqdm(glob.glob(str(Path(path)) + '/**/*.*', recursive=True)): + shutil.copyfile(file, new_path / Path(file).name) + + +def extract_boxes(path='../coco128/'): # from utils.datasets import *; extract_boxes('../coco128') + # Convert detection dataset into classification dataset, with one directory per class + + path = Path(path) # 
images dir + shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing + files = list(path.rglob('*.*')) + n = len(files) # number of files + for im_file in tqdm(files, total=n): + if im_file.suffix[1:] in img_formats: + # image + im = cv2.imread(str(im_file))[..., ::-1] # BGR to RGB + h, w = im.shape[:2] + + # labels + lb_file = Path(img2label_paths([str(im_file)])[0]) + if Path(lb_file).exists(): + with open(lb_file, 'r') as f: + lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + + for j, x in enumerate(lb): + c = int(x[0]) # class + f = (path / 'classifier') / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg' # new filename + if not f.parent.is_dir(): + f.parent.mkdir(parents=True) + + b = x[1:] * [w, h, w, h] # box + # b[2:] = b[2:].max() # rectangle to square + b[2:] = b[2:] * 1.2 + 3 # pad + b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(np.int) + + b[[0, 2]] = np.clip(b[[0, 2]], 0, w) # clip boxes outside of image + b[[1, 3]] = np.clip(b[[1, 3]], 0, h) + assert cv2.imwrite(str(f), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {f}' + + +def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0)): # from utils.datasets import *; autosplit('../coco128') + """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files + # Arguments + path: Path to images directory + weights: Train, val, test weights (list) + """ + path = Path(path) # images dir + files = list(path.rglob('*.*')) + n = len(files) # number of files + indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split + txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt'] # 3 txt files + [(path / x).unlink() for x in txt if (path / x).exists()] # remove existing + for i, img in tqdm(zip(indices, files), total=n): + if img.suffix[1:] in img_formats: + with open(path / txt[i], 'a') as f: + f.write(str(img) + '\n') # add image to txt file diff --git 
a/model/detection/yolov5/utils/google_app_engine/Dockerfile b/model/detection/yolov5/utils/google_app_engine/Dockerfile new file mode 100644 index 0000000..0155618 --- /dev/null +++ b/model/detection/yolov5/utils/google_app_engine/Dockerfile @@ -0,0 +1,25 @@ +FROM gcr.io/google-appengine/python + +# Create a virtualenv for dependencies. This isolates these packages from +# system-level packages. +# Use -p python3 or -p python3.7 to select python version. Default is version 2. +RUN virtualenv /env -p python3 + +# Setting these environment variables are the same as running +# source /env/bin/activate. +ENV VIRTUAL_ENV /env +ENV PATH /env/bin:$PATH + +RUN apt-get update && apt-get install -y python-opencv + +# Copy the application's requirements.txt and run pip to install all +# dependencies into the virtualenv. +ADD requirements.txt /app/requirements.txt +RUN pip install -r /app/requirements.txt + +# Add the application source code. +ADD . /app + +# Run a WSGI server to serve the application. gunicorn must be declared as +# a dependency in requirements.txt. 
+CMD gunicorn -b :$PORT main:app diff --git a/model/detection/yolov5/utils/google_app_engine/additional_requirements.txt b/model/detection/yolov5/utils/google_app_engine/additional_requirements.txt new file mode 100644 index 0000000..5fcc305 --- /dev/null +++ b/model/detection/yolov5/utils/google_app_engine/additional_requirements.txt @@ -0,0 +1,4 @@ +# add these requirements in your app on top of the existing ones +pip==18.1 +Flask==1.0.2 +gunicorn==19.9.0 diff --git a/model/detection/yolov5/utils/google_app_engine/app.yaml b/model/detection/yolov5/utils/google_app_engine/app.yaml new file mode 100644 index 0000000..ac29d10 --- /dev/null +++ b/model/detection/yolov5/utils/google_app_engine/app.yaml @@ -0,0 +1,14 @@ +runtime: custom +env: flex + +service: yolov5app + +liveness_check: + initial_delay_sec: 600 + +manual_scaling: + instances: 1 +resources: + cpu: 1 + memory_gb: 4 + disk_size_gb: 20 \ No newline at end of file diff --git a/model/detection/yolov5/utils/metrics.py b/model/detection/yolov5/utils/metrics.py new file mode 100644 index 0000000..99d5bcf --- /dev/null +++ b/model/detection/yolov5/utils/metrics.py @@ -0,0 +1,200 @@ +# Model validation metrics + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from . import general + + +def fitness(x): + # Model fitness as a weighted combination of metrics + w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95] + return (x[:, :4] * w).sum(1) + + +def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='precision-recall_curve.png', names=[]): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (nparray, nx1 or nx10). + conf: Objectness value from 0-1 (nparray). + pred_cls: Predicted object classes (nparray). + target_cls: True object classes (nparray). 
+ plot: Plot precision-recall curve at mAP@0.5 + save_dir: Plot save directory + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(target_cls) + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + pr_score = 0.1 # score to evaluate P and R https://github.com/ultralytics/yolov3/issues/898 + s = [unique_classes.shape[0], tp.shape[1]] # number class, number iou thresholds (i.e. 10 for mAP0.5...0.95) + ap, p, r = np.zeros(s), np.zeros(s), np.zeros(s) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = (target_cls == c).sum() # number of labels + n_p = i.sum() # number of predictions + + if n_p == 0 or n_l == 0: + continue + else: + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + 1e-16) # recall curve + r[ci] = np.interp(-pr_score, -conf[i], recall[:, 0]) # r at pr_score, negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-pr_score, -conf[i], precision[:, 0]) # p at pr_score + + # AP from recall-precision curve + for j in range(tp.shape[1]): + ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + if plot and (j == 0): + py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 + + # Compute F1 score (harmonic mean of precision and recall) + f1 = 2 * p * r / (p + r + 1e-16) + + if plot: + plot_pr_curve(px, py, ap, save_dir, names) + + return p, r, ap, f1, unique_classes.astype('int32') + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves + # Arguments + recall: The recall curve (list) + precision: The precision curve (list) + # Returns + Average precision, precision curve, recall curve + """ + + # 
Append sentinel values to beginning and end + mrec = np.concatenate(([0.], recall, [recall[-1] + 0.01])) + mpre = np.concatenate(([1.], precision, [0.])) + + # Compute the precision envelope + mpre = np.flip(np.maximum.accumulate(np.flip(mpre))) + + # Integrate area under curve + method = 'interp' # methods: 'continuous', 'interp' + if method == 'interp': + x = np.linspace(0, 1, 101) # 101-point interp (COCO) + ap = np.trapz(np.interp(x, mrec, mpre), x) # integrate + else: # 'continuous' + i = np.where(mrec[1:] != mrec[:-1])[0] # points where x axis (recall) changes + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve + + return ap, mpre, mrec + + +class ConfusionMatrix: + # Updated version of https://github.com/kaanakan/object_detection_confusion_matrix + def __init__(self, nc, conf=0.25, iou_thres=0.45): + self.matrix = np.zeros((nc + 1, nc + 1)) + self.nc = nc # number of classes + self.conf = conf + self.iou_thres = iou_thres + + def process_batch(self, detections, labels): + """ + Return intersection-over-union (Jaccard index) of boxes. + Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 
+ Arguments: + detections (Array[N, 6]), x1, y1, x2, y2, conf, class + labels (Array[M, 5]), class, x1, y1, x2, y2 + Returns: + None, updates confusion matrix accordingly + """ + detections = detections[detections[:, 4] > self.conf] + gt_classes = labels[:, 0].int() + detection_classes = detections[:, 5].int() + iou = general.box_iou(labels[:, 1:], detections[:, :4]) + + x = torch.where(iou > self.iou_thres) + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + else: + matches = np.zeros((0, 3)) + + n = matches.shape[0] > 0 + m0, m1, _ = matches.transpose().astype(np.int16) + for i, gc in enumerate(gt_classes): + j = m0 == i + if n and sum(j) == 1: + self.matrix[gc, detection_classes[m1[j]]] += 1 # correct + else: + self.matrix[gc, self.nc] += 1 # background FP + + if n: + for i, dc in enumerate(detection_classes): + if not any(m1 == i): + self.matrix[self.nc, dc] += 1 # background FN + + def matrix(self): + return self.matrix + + def plot(self, save_dir='', names=()): + try: + import seaborn as sn + + array = self.matrix / (self.matrix.sum(0).reshape(1, self.nc + 1) + 1E-6) # normalize + array[array < 0.005] = np.nan # don't annotate (would appear as 0.00) + + fig = plt.figure(figsize=(12, 9), tight_layout=True) + sn.set(font_scale=1.0 if self.nc < 50 else 0.8) # for label size + labels = (0 < len(names) < 99) and len(names) == self.nc # apply names to ticklabels + sn.heatmap(array, annot=self.nc < 30, annot_kws={"size": 8}, cmap='Blues', fmt='.2f', square=True, + xticklabels=names + ['background FN'] if labels else "auto", + yticklabels=names + ['background FP'] if labels else "auto").set_facecolor((1, 1, 1)) + fig.axes[0].set_xlabel('True') + 
fig.axes[0].set_ylabel('Predicted') + fig.savefig(Path(save_dir) / 'confusion_matrix.png', dpi=250) + except Exception as e: + pass + + def print(self): + for i in range(self.nc + 1): + print(' '.join(map(str, self.matrix[i]))) + + +# Plots ---------------------------------------------------------------------------------------------------------------- + +def plot_pr_curve(px, py, ap, save_dir='.', names=()): + fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True) + py = np.stack(py, axis=1) + + if 0 < len(names) < 21: # show mAP in legend if < 10 classes + for i, y in enumerate(py.T): + ax.plot(px, y, linewidth=1, label=f'{names[i]} %.3f' % ap[i, 0]) # plot(recall, precision) + else: + ax.plot(px, py, linewidth=1, color='grey') # plot(recall, precision) + + ax.plot(px, py.mean(1), linewidth=3, color='blue', label='all classes %.3f mAP@0.5' % ap[:, 0].mean()) + ax.set_xlabel('Recall') + ax.set_ylabel('Precision') + ax.set_xlim(0, 1) + ax.set_ylim(0, 1) + plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left") + fig.savefig(Path(save_dir) / 'precision_recall_curve.png', dpi=250) diff --git a/model/detection/yolov5/weights/download_weights.sh b/model/detection/yolov5/weights/download_weights.sh new file mode 100644 index 0000000..43c8e31 --- /dev/null +++ b/model/detection/yolov5/weights/download_weights.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Download latest models from https://github.com/ultralytics/yolov5/releases +# Usage: +# $ bash weights/download_weights.sh + +python - < 1 # JSD only valid with aug splits set + # self.train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).to(device) + # elif mixup_active: + # # smoothing is handled with mixup target transform + # self.train_loss_fn = SoftTargetCrossEntropy().to(device) + if args.smoothing: + self.train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).to(device) + else: + self.train_loss_fn = nn.CrossEntropyLoss().to(device) + self.validate_loss_fn = 
nn.CrossEntropyLoss().to(device) + + + def get_model_params(self): + return self.model.cpu().state_dict() + + def set_model_params(self, model_parameters): + self.model.load_state_dict(model_parameters) + + def train(self, train_data, device, args): + model = self.model + + model.to(device) + model.train() + + epoch_loss = [] + for epoch in range(args.epochs): + batch_loss = [] + for batch_idx, (x, labels) in enumerate(train_data): + # logging.info(images.shape) + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + batch_loss.append(loss.item()) + if len(batch_loss) > 0: + epoch_loss.append(sum(batch_loss) / len(batch_loss)) + logging.info('(Trainer_ID {}. Local Training Epoch: {} \tLoss: {:.6f}'.format( + self.id, epoch, sum(epoch_loss) / len(epoch_loss))) + self.lr_scheduler.step(epoch=epoch + 1, metric=None) + + + def train_one_epoch(self, train_data, device, args, epoch, tracker=None, metrics=None): + model = self.model + + model.to(device) + model.train() + batch_loss = [] + for batch_idx, (x, labels) in enumerate(train_data): + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + # logging.debug("labels: {}".format(labels)) + # logging.debug("pred: {}".format(log_probs)) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + batch_loss.append(loss.item()) + if (metrics is not None) and (tracker is not None): + metric_stat = metrics.evaluate(loss, log_probs, labels) + tracker.update_metrics(metric_stat, n_samples=labels.size(0)) + if len(batch_loss) > 0: + logging.info('(Trainer_ID {}. Local Training Epoch: {}, Iter: {} \tLoss: {:.6f} ACC1:{}'.format( + self.id, epoch, batch_idx, sum(batch_loss) / len(batch_loss), metric_stat['Acc1'])) + else: + if len(batch_loss) > 0: + logging.info('(Trainer_ID {}. 
Local Training Epoch: {}, Iter: {} \tLoss: {:.6f}'.format( + self.id, epoch, batch_idx, sum(batch_loss) / len(batch_loss))) + self.lr_scheduler.step(epoch=epoch + 1, metric=None) + + if (metrics is not None) and (tracker is not None): + return None + else: + return sum(batch_loss) / len(batch_loss) + + + + def train_one_step(self, train_batch_data, device, args, tracker=None, metrics=None): + model = self.model + + model.to(device) + model.train() + x, labels = train_batch_data + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + if (tracker is not None) and (metrics is not None): + metric_stat = metrics.evaluate(loss, log_probs, labels) + tracker.update_metrics(metric_stat, n_samples=labels.size(0)) + + return loss, log_probs, labels + + + + def test(self, test_data, device, args, tracker=None, metrics=None): + model = self.model + + model.eval() + model.to(device) + + + with torch.no_grad(): + for batch_idx, (x, target) in enumerate(test_data): + x = x.to(device) + target = target.to(device) + pred = model(x) + # logging.debug("labels: {}".format(target)) + # logging.debug("pred: {}".format(pred)) + loss = self.validate_loss_fn(pred, target) + if (metrics is not None) and (tracker is not None): + metric_stat = metrics.evaluate(loss, pred, target) + tracker.update_metrics(metric_stat, n_samples=target.size(0)) + logging.info('(Trainer_ID {}. 
Local Testing Iter: {} \tLoss: {:.6f} ACC1:{}'.format( + self.id, batch_idx, loss.item(), metric_stat['Acc1'])) + else: + raise NotImplementedError + + if (metrics is not None) and (tracker is not None): + return None + else: + raise NotImplementedError + + + + + diff --git a/training/fedavg_classification_trainer.py b/training/fedavg_classification_trainer.py new file mode 100644 index 0000000..ba99c0c --- /dev/null +++ b/training/fedavg_classification_trainer.py @@ -0,0 +1,126 @@ +import logging + +import torch +from torch import nn + +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy +from timm.optim import create_optimizer +from timm.scheduler import create_scheduler + + +from FedML.fedml_core.trainer.model_trainer import ModelTrainer + + +class ClassificationTrainer(ModelTrainer): + def __init__(self, model, device, args): + super().__init__(model) + # self.model = model + self.args = args + + if args.opt in ['rmsproptf']: + self.optimizer = create_optimizer(args, model) + elif args.opt == 'sgd': + self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, + weight_decay=args.wd, momentum=args.momentum) + elif args.opt == 'adam': + self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), + lr=args.lr, + weight_decay=args.wd, amsgrad=True) + else: + raise NotImplementedError + # TODO + # In fedavg, decay according to the round + args.decay_epochs = args.decay_rounds + if args.sched == 'step': + self.lr_scheduler, self.num_epochs = create_scheduler(args, self.optimizer) + elif args.sched == 'StepLR': + self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, + args.decay_epochs, args.decay_rate) + else: + raise NotImplementedError + + self.lr_scheduler.step(0) + + if args.smoothing: + self.train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).to(device) + else: + self.train_loss_fn = nn.CrossEntropyLoss().to(device) + self.validate_loss_fn = 
nn.CrossEntropyLoss().to(device) + + + def get_model_params(self): + return self.model.cpu().state_dict() + + def set_model_params(self, model_parameters): + self.model.load_state_dict(model_parameters) + + def train(self, train_data, device, args): + model = self.model + + model.to(device) + model.train() + + epoch_loss = [] + for epoch in range(args.epochs): + batch_loss = [] + for batch_idx, (x, labels) in enumerate(train_data): + # logging.info(images.shape) + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + batch_loss.append(loss.item()) + logging.info('Local Training Epoch: {} iter: {} \t Loss: {:.6f}'.format( + epoch, batch_idx, loss.item())) + if len(batch_loss) > 0: + epoch_loss.append(sum(batch_loss) / len(batch_loss)) + logging.info('(Trainer_ID {}. Local Training Epoch: {} \tLoss: {:.6f}'.format( + self.id, epoch, sum(epoch_loss) / len(epoch_loss))) + # self.lr_scheduler.step(epoch=epoch + 1, metric=None) + self.lr_scheduler.step(epoch=args.round_idx) + + + def test(self, test_data, device, args): + model = self.model + + model.eval() + model.to(device) + + metrics = { + 'test_correct': 0, + 'test_loss': 0, + 'test_precision': 0, + 'test_recall': 0, + 'test_total': 0 + } + + # criterion = nn.CrossEntropyLoss().to(device) + with torch.no_grad(): + for batch_idx, (x, target) in enumerate(test_data): + x = x.to(device) + target = target.to(device) + pred = model(x) + loss = self.validate_loss_fn(pred, target) + if args.dataset == "stackoverflow_lr": + predicted = (pred > .5).int() + correct = predicted.eq(target).sum(axis=-1).eq(target.size(1)).sum() + true_positive = ((target * predicted) > .1).int().sum(axis=-1) + precision = true_positive / (predicted.sum(axis=-1) + 1e-13) + recall = true_positive / (target.sum(axis=-1) + 1e-13) + metrics['test_precision'] += precision.sum().item() + metrics['test_recall'] += 
recall.sum().item() + else: + _, predicted = torch.max(pred, -1) + correct = predicted.eq(target).sum() + + metrics['test_correct'] += correct.item() + metrics['test_loss'] += loss.item() * target.size(0) + metrics['test_total'] += target.size(0) + logging.info('Local Testing iter: {} \t Loss: {:.6f} Acc: {:.6f}'.format( + batch_idx, loss.item(), metrics['test_correct']/metrics['test_total'])) + return metrics + + def test_on_the_server(self, train_data_local_dict, test_data_local_dict, device, args=None) -> bool: + pass diff --git a/utils/context.py b/utils/context.py new file mode 100644 index 0000000..76cea18 --- /dev/null +++ b/utils/context.py @@ -0,0 +1,35 @@ +from contextlib import contextmanager +import threading + +import traceback +from mpi4py import MPI + + + +@contextmanager +def raise_MPI_error(): + import logging + logging.debug("Debugging, Enter the MPI catch error") + try: + yield + except Exception as e: + logging.info(e) + logging.info('traceback.format_exc():\n%s' % traceback.format_exc()) + MPI.COMM_WORLD.Abort() + +@contextmanager +def raise_error_without_process(): + import logging + logging.debug("Debugging, Enter the MPI catch error") + try: + yield + except Exception as e: + logging.info(e) + logging.info('traceback.format_exc():\n%s' % traceback.format_exc()) + +@contextmanager +def get_lock(lock: threading.Lock()): + lock.acquire() + yield + if lock.locked(): + lock.release() diff --git a/utils/logger.py b/utils/logger.py new file mode 100644 index 0000000..f2c53d0 --- /dev/null +++ b/utils/logger.py @@ -0,0 +1,196 @@ +import os +import json +import time +import platform +import logging + +def logging_config(args, process_id): + # customize the log format + while logging.getLogger().handlers: + logging.getLogger().handlers.clear() + console = logging.StreamHandler() + if args.level == 'INFO': + console.setLevel(logging.INFO) + elif args.level == 'DEBUG': + console.setLevel(logging.DEBUG) + else: + raise NotImplementedError + formatter = 
logging.Formatter(str(process_id) + + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') + console.setFormatter(formatter) + # Create an instance + logging.getLogger().addHandler(console) + # logging.getLogger().info("test") + logging.basicConfig() + logger = logging.getLogger() + if args.level == 'INFO': + logger.setLevel(logging.INFO) + elif args.level == 'DEBUG': + logger.setLevel(logging.DEBUG) + else: + raise NotImplementedError + logging.info(args) + + + +class Logger(object): + + INFO = 0 + DEBUG = 1 + WARNING = 2 + ERROR = 3 + CRITICAL = 4 + + @classmethod + def config_logger(cls, file_folder='.', level="info", + save_log=False, display_source=False): + """ + :param filename: ending with .json + :param auto_save: save the JSON file after every addition + """ + cls.file_folder = file_folder + cls.file_json = os.path.join(file_folder, "log-1.json") + # cls.file_log can be changed by add_log_file() + cls.file_log = os.path.join(file_folder, "log.log") + cls.values = [] + cls.save_log = save_log + logger = logging.getLogger() + if display_source: + cls.formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)d] %(levelname)s %(message)s') + else: + cls.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') + cls.level = level + if level == "info": + logger.setLevel(logging.INFO) + elif level == "debug": + logger.setLevel(logging.DEBUG) + elif level == "warning": + logger.setLevel(logging.WARNING) + elif level == "error": + logger.setLevel(logging.ERROR) + elif level == "critical": + logger.setLevel(logging.CRITICAL) + + strhdlr = logging.StreamHandler() + strhdlr.setFormatter(cls.formatter) + logger.addHandler(strhdlr) + if save_log: + cls.add_log_file(cls.file_log) + cls.logger = logger + + + @classmethod + def add_log_file(cls, logfile): + assert cls.save_log is True + hdlr = logging.FileHandler(logfile) + hdlr.setFormatter(cls.formatter) + cls.logger.addHandler(hdlr) + + + @classmethod + def display_metric(cls, 
name, values, tags): + cls.info( + value="{name} ({tags}): {values} ".format( + name=name, values=values) + ) + + + @classmethod + def cache_metric_in_memory(cls, name, values, tags): + """ + Store a scalar metric. Example: + name="runtime", + values={ + "time": current_time, + "rank": rank, + "epoch": epoch, + "best_perf": best_perf, + }, + tags={"split": "test", "type": "local_model_avg"}, + """ + cls.values.append({"measurement": name, **tags, **values}) + + + @classmethod + def log_timer(cls, name, values, tags): + cls.info( + value="{name} ({tags}): {values} ".format( + name=name, values=values) + ) + + + @classmethod + def info(cls, value): + cls.logger.info(value) + + @classmethod + def debug(cls, value): + cls.logger.debug(value) + + @classmethod + def warning(cls, value): + cls.logger.warning(value) + + @classmethod + def error(cls, value): + cls.logger.error(value) + + @classmethod + def critical(cls, value): + cls.logger.critical(value) + + + @classmethod + def save_json(cls): + """Save the internal memory to a file.""" + with open(cls.file_json, "w") as fp: + json.dump(cls.values, fp, indent=" ") + + if len(cls.values) > 1e3: + # reset 'values' and redirect the json file to other name. + cls.values = [] + cls.redirect_new_json() + + + @classmethod + def redirect_new_json(cls): + """get the number of existing json files under the current folder.""" + existing_json_files = [ + file for file in os.listdir(cls.file_folder) if "json" in file + ] + cls.file_json = os.path.join( + cls.file_folder, "log-{}.json".format(len(existing_json_files) + 1) + ) + + +# Usage example +def display_training_stat(conf, tracker, epoch, n_bits_to_transmit): + current_time = time.strftime("%Y-%m-%d %H:%M:%S") + + # display the runtime training information. 
+ Logger.display_metric( + name="runtime", + values={ + "time": current_time, + "epoch": epoch, + "n_bits_to_transmit": n_bits_to_transmit / 8 / (2 ** 20), + **tracker(), + }, + tags={"split": "train"} + ) + + +# Usage example +def display_test_stat(conf, tracker, epoch, label="local"): + current_time = time.strftime("%Y-%m-%d %H:%M:%S") + + # display the runtime training information. + Logger.display_metric( + name="runtime", + values={ + "time": current_time, + "epoch": epoch, + **tracker(), + }, + tags={"split": "test", "type": label} + ) diff --git a/utils/metrics.py b/utils/metrics.py new file mode 100644 index 0000000..9b4ccbe --- /dev/null +++ b/utils/metrics.py @@ -0,0 +1,67 @@ +import math + + +class Metrics(object): + + def __init__(self, topks=[1], task="classification"): + self.task = task + self.topks = topks + self.metric_names = self.get_metric_names(topks, task) + self.metrics_fn = self._get_metric_measure(topks, task) + + def evaluate(self, loss, output, target): + return self.metrics_fn(loss, output, target) + + @classmethod + def get_metric_names(cls, topks, task): + if task == "classification": + metric_names = ["Acc{}".format(topk) for topk in topks] + metric_names += ["Loss"] + elif task == "stackoverflow_lr": + metric_names = ["Acc", "Loss", "Precision", "Recall"] + else: + raise NotImplementedError + return metric_names + + def _get_metric_measure(self, topks, task): + if task == "classification": + return self._classification_metric + elif task == "stackoverflow_lr": + return self._stackoverflow_lr_metric + else: + raise NotImplementedError + + assert self.metric_names is not None + + def _classification_metric(self, loss, output, target): + """Computes the precision@k for the specified values of k""" + metric_stat = {} + metric_stat["Loss"] = loss.item() + + maxk = max(self.topks) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + for 
topk in self.topks: + correct_k = correct[:topk].view(-1).float().sum(0, keepdim=True) + # res.append(correct_k.mul_(100.0 / batch_size).item()) + metric_stat["Acc{}".format(topk)] = correct_k.mul_(100.0 / batch_size).item() + + return metric_stat + + def _stackoverflow_lr_metric(self, loss, output, target): + metric_stat = {} + metric_stat["Loss"] = loss.item() + predicted = (output > .5).int() + correct = predicted.eq(target).sum(axis=-1).eq(target.size(1)).sum() + true_positive = ((target * predicted) > .1).int().sum(axis=-1) + metric_stat["Precision"] = true_positive / (predicted.sum(axis=-1) + 1e-13) + metric_stat["Recall"] = true_positive / (target.sum(axis=-1) + 1e-13) + metric_stat["Acc"] = correct.mul_(100.0 / target.size(0)).item() + metric_stat["Loss"] = loss.item() + return metric_stat + + diff --git a/utils/tracker.py b/utils/tracker.py new file mode 100644 index 0000000..dcce876 --- /dev/null +++ b/utils/tracker.py @@ -0,0 +1,133 @@ +from copy import deepcopy + + +class MaxMeter(object): + """ + Keeps track of the max of all the values that are 'add'ed + """ + + def __init__(self): + self.max = None + + def update(self, value): + """ + Add a value to the accumulator. + :return: `true` if the provided value became the new max + """ + if self.max is None or value > self.max: + self.max = deepcopy(value) + return True + else: + return False + + def value(self): + """Access the current running average""" + return self.max + + +class MinMeter(object): + """ + Keeps track of the max of all the values that are 'add'ed + """ + + def __init__(self): + self.min = None + + def update(self, value): + """ + Add a value to the accumulator. 
+ :return: `true` if the provided value became the new max + """ + if self.min is None or value < self.min: + self.min = deepcopy(value) + return True + else: + return False + + def value(self): + """Access the current running average""" + return self.min + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.max = -float("inf") + self.min = float("inf") + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + self.max = val if val > self.max else self.max + self.min = val if val < self.min else self.min + + +class RuntimeTracker(object): + """Tracking the runtime stat for local training.""" + + # def __init__(self, metrics_to_track=["top1"], on_cuda=True): + def __init__(self, things_to_track=["loss"], on_cuda=True): + self.things_to_track = things_to_track + self.on_cuda = on_cuda + self.n_samples = 0 + self.stat = None + self.reset() + + + def reset(self): + self.stat = dict((name, AverageMeter()) for name in self.things_to_track) + self.n_samples = 0 + + # def evaluate_global_metric(self, metric): + # return global_average( + # self.stat[metric].sum, self.stat[metric].count, on_cuda=self.on_cuda + # ).item() + + # def evaluate_global_metrics(self): + # return [self.evaluate_global_metric(metric) for metric in self.metrics_to_track] + + def get_metrics_performance(self): + return [self.stat[thing].avg for thing in self.things_to_track] + + def update_metrics(self, metric_stat, n_samples): + self.n_samples += n_samples + for thing in self.things_to_track: + self.stat[thing].update(metric_stat[thing], n_samples) + + def __call__(self): + return dict((name, val.avg) for name, val in self.stat.items()) + + +class BestPerf(object): + def __init__(self, best_perf=None, larger_is_better=True): + self.best_perf = best_perf + self.cur_perf = 
None + self.best_perf_locs = [] + self.larger_is_better = larger_is_better + + # define meter + self._define_meter() + + def _define_meter(self): + self.meter = MaxMeter() if self.larger_is_better else MinMeter() + + def update(self, perf, perf_location): + self.is_best = self.meter.update(perf) + self.cur_perf = perf + + if self.is_best: + self.best_perf = perf + self.best_perf_locs += [perf_location] + + def get_best_perf_loc(self): + return self.best_perf_locs[-1] if len(self.best_perf_locs) != 0 else None diff --git a/utils/wandb_util.py b/utils/wandb_util.py new file mode 100644 index 0000000..984aae6 --- /dev/null +++ b/utils/wandb_util.py @@ -0,0 +1,18 @@ +import wandb + +def wandb_log(prefix, sp_values, com_values): + """ + prefix + tags.values is the name of sp_values; + values should include information like: + {"Acc": 0.9, "Loss":} + com_values should include information like: + {"epoch": epoch, } + """ + new_values = {} + for k, _ in sp_values.items(): + new_values[prefix+"/" + k] = sp_values[k] + new_values.update(com_values) + wandb.log(new_values) + + +