diff --git a/.gitignore b/.gitignore index c3d0c6c..d5f1a2f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,8 @@ .vscode/* *.pyc +*.conf wandb *.zip -*.log diff --git a/FedML b/FedML index 400072e..49a3c76 160000 --- a/FedML +++ b/FedML @@ -1 +1 @@ -Subproject commit 400072ef5daa9a9ca0f205ff3a90ccf13f975729 +Subproject commit 49a3c760c7d166d6730c118eb0aafae872c852bf diff --git a/data/cifar100/download_cifar100.sh b/data/cifar100/download_cifar100.sh new file mode 100644 index 0000000..cb2d7f0 --- /dev/null +++ b/data/cifar100/download_cifar100.sh @@ -0,0 +1 @@ +wget https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz \ No newline at end of file diff --git a/data_preprocessing/ImageNet/data_loader.py b/data_preprocessing/ImageNet/data_loader.py index 7a470cb..e528b23 100644 --- a/data_preprocessing/ImageNet/data_loader.py +++ b/data_preprocessing/ImageNet/data_loader.py @@ -4,9 +4,15 @@ import torch import torch.utils.data as data import torchvision.transforms as transforms +from torch.utils.data.distributed import DistributedSampler +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset from .datasets import ImageNet +from .datasets import ImageNet100 from .datasets import ImageNet_truncated +from .datasets_hdf5 import ImageNet_hdf5 +from .datasets_hdf5 import ImageNet_truncated_hdf5 + logging.basicConfig() logger = logging.getLogger() @@ -35,12 +41,18 @@ def __call__(self, img): return img -def _data_transforms_ImageNet(): +def _data_transforms_ImageNet(args): # IMAGENET_MEAN = [0.5071, 0.4865, 0.4409] # IMAGENET_STD = [0.2673, 0.2564, 0.2762] + if args.data_transform == 'FLTransform': + IMAGENET_MEAN = [0.5, 0.5, 0.5] + IMAGENET_STD = [0.5, 0.5, 0.5] + elif args.data_transform == 'NormalTransform': + IMAGENET_MEAN = [0.485, 0.456, 0.406] + IMAGENET_STD = [0.229, 0.224, 0.225] + else: + raise NotImplementedError - IMAGENET_MEAN = [0.485, 0.456, 0.406] - IMAGENET_STD = [0.229, 0.224, 0.225] image_size = 
224 train_transform = transforms.Compose([ @@ -62,74 +74,244 @@ def _data_transforms_ImageNet(): return train_transform, valid_transform -# for centralized training -def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None): - return get_dataloader_ImageNet(datadir, train_bs, test_bs, dataidxs) - - -# for local devices -def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test): - return get_dataloader_test_ImageNet(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test) +def get_ImageNet_truncated(imagenet_dataset_train, imagenet_dataset_test, train_bs, + test_bs, dataidxs=None, net_dataidx_map=None, args=None): + """ + imagenet_dataset_train, imagenet_dataset_test should be ImageNet or ImageNet_hdf5 + """ + if type(imagenet_dataset_train) in [ImageNet, ImageNet100]: + dl_obj = ImageNet_truncated + elif type(imagenet_dataset_train) == ImageNet_hdf5: + dl_obj = ImageNet_truncated_hdf5 + else: + raise NotImplementedError() -def get_dataloader_ImageNet_truncated(imagenet_dataset_train: ImageNet, imagenet_dataset_test: ImageNet, train_bs, - test_bs, dataidxs=None, net_dataidx_map=None): - dl_obj = ImageNet_truncated - - transform_train, transform_test = _data_transforms_ImageNet() + transform_train, transform_test = _data_transforms_ImageNet(args) train_ds = dl_obj(imagenet_dataset_train, dataidxs, net_dataidx_map, train=True, transform=transform_train, download=False) - test_ds = dl_obj(imagenet_dataset_test, dataidxs=None, net_dataidx_map=None, train=False, transform=transform_test, + test_ds = dl_obj(imagenet_dataset_test, dataidxs, net_dataidx_map, train=False, transform=transform_test, download=False) + return train_ds, test_ds - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) - - return train_dl, test_dl - - -def get_dataloader_ImageNet(datadir, train_bs, test_bs, 
dataidxs=None): - dl_obj = ImageNet - - transform_train, transform_test = _data_transforms_ImageNet() - train_ds = dl_obj(datadir, dataidxs=dataidxs, train=True, transform=transform_train, download=False) - test_ds = dl_obj(datadir, dataidxs=None, train=False, transform=transform_test, download=False) +def get_dataloader(dataset_train, dataset_test, train_bs, + test_bs, dataidxs=None, net_dataidx_map=None, args=None): - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) + train_dl = data.DataLoader(dataset=dataset_train, batch_size=train_bs, shuffle=True, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) + test_dl = data.DataLoader(dataset=dataset_test, batch_size=test_bs, shuffle=False, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) return train_dl, test_dl -def get_dataloader_test_ImageNet(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None): - dl_obj = ImageNet - transform_train, transform_test = _data_transforms_ImageNet() +def get_timm_loader(dataset_train, dataset_test, args): + """ + Use for get data loader of timm, for data transforms, augmentations, etc. 
+ dataset: self-defined dataset, + return: timm loader + """ + logging.info("Using timm dataset and dataloader") + + # TODO not sure whether any problem here + data_config = resolve_data_config(vars(args), model=None, verbose=args.rank == 0) + + # setup augmentation batch splits for contrastive loss or split bn + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits > 1, 'A split of 1 makes no sense' + num_aug_splits = args.aug_splits + + # wrap dataset in AugMix helper + if num_aug_splits > 1: + dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits) + + # create data loaders w/ augmentation pipeiine + train_interpolation = args.train_interpolation + if args.no_aug or not train_interpolation: + train_interpolation = data_config['interpolation'] + + # some args not in the args + args.prefetcher = False + args.pin_mem = False + collate_fn = None + args.use_multi_epochs_loader = False + + train_batch_size = args.batch_size + test_batch_size = args.batch_size // 4 + + if args.data_transform == 'FLTransform': + data_config['mean'] = [0.5, 0.5, 0.5] + data_config['std'] = [0.5, 0.5, 0.5] + elif args.data_transform == 'NormalTransform': + pass + # data_config['mean'] = + # data_config['std'] = + else: + raise NotImplementedError + + logging.info("data transform, MEAN: {}, STD: {}.".format( + data_config['mean'], data_config['std'])) + loader_train = create_loader( + dataset_train, + input_size=data_config['input_size'], + batch_size=train_batch_size, + is_training=True, + use_prefetcher=args.prefetcher, + no_aug=args.no_aug, + re_prob=args.reprob, + re_mode=args.remode, + re_count=args.recount, + re_split=args.resplit, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + auto_augment=args.aa, + num_aug_splits=num_aug_splits, + interpolation=train_interpolation, + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.data_load_num_workers, + 
distributed=args.distributed, + collate_fn=collate_fn, + pin_memory=args.pin_mem, + use_multi_epochs_loader=args.use_multi_epochs_loader + ) + + loader_eval = create_loader( + dataset_test, + input_size=data_config['input_size'], + batch_size=test_batch_size, + is_training=False, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.data_load_num_workers, + distributed=args.distributed, + crop_pct=data_config['crop_pct'], + pin_memory=args.pin_mem, + ) + return loader_train, loader_eval + + +def distributed_centralized_ImageNet_loader(dataset, data_dir, + world_size, rank, batch_size, args): + """ + Used for generating distributed dataloader for + accelerating centralized training + """ + + train_bs=batch_size + test_bs=batch_size + + transform_train, transform_test = _data_transforms_ImageNet(args) + if dataset == 'ILSVRC2012': + train_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + elif dataset == 'ILSVRC2012-100': + train_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 100 + elif dataset == 'ILSVRC2012_hdf5': + train_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + else: + raise NotImplementedError + + + if args.if_timm_dataset: + train_dl, test_dl = get_timm_loader(train_dataset, test_dataset, args) + else: + train_sam = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank) + # test_sam = 
DistributedSampler(test_dataset, num_replicas=world_size, rank=rank) + + train_dl = data.DataLoader(train_dataset, batch_size=train_bs , sampler=train_sam, + pin_memory=True, num_workers=args.data_load_num_workers) + + test_dl = data.DataLoader(test_dataset, batch_size=test_bs, sampler=None, + pin_memory=True, num_workers=args.data_load_num_workers) - train_ds = dl_obj(datadir, dataidxs=dataidxs_train, train=True, transform=transform_train, download=True) - test_ds = dl_obj(datadir, dataidxs=dataidxs_test, train=False, transform=transform_test, download=True) - - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) - - return train_dl, test_dl - - -def load_partition_data_ImageNet(dataset, data_dir, - partition_method=None, partition_alpha=None, client_number=100, batch_size=10): - train_dataset = ImageNet(data_dir=data_dir, - dataidxs=None, - train=True) + train_data_num = len(train_dataset) + test_data_num = len(test_dataset) - test_dataset = ImageNet(data_dir=data_dir, - dataidxs=None, - train=False) + logging.info("len of train_dataset: {}".format(train_data_num)) + logging.info("len of test_dataset: {}".format(test_data_num)) + + return train_data_num, test_data_num, train_dl, test_dl, \ + None, None, None, class_num + + +def load_partition_data_ImageNet(dataset, data_dir, partition_method=None, partition_alpha=None, + client_number=100, batch_size=10, args=None): + + transform_train, transform_test = _data_transforms_ImageNet(args) + if dataset == 'ILSVRC2012': + train_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + elif dataset == 'ILSVRC2012-100': + train_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=True, + 
transform=transform_train) + + test_dataset = ImageNet100(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 100 + elif dataset == 'ILSVRC2012_hdf5': + train_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=True, + transform=transform_train) + + test_dataset = ImageNet_hdf5(data_dir=data_dir, + dataidxs=None, + train=False, + transform=transform_test) + class_num = 1000 + else: + raise NotImplementedError net_dataidx_map = train_dataset.get_net_dataidx_map() - class_num = 1000 # logging.info("traindata_cls_counts = " + str(traindata_cls_counts)) # train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) @@ -137,11 +319,12 @@ def load_partition_data_ImageNet(dataset, data_dir, test_data_num = len(test_dataset) class_num_dict = train_dataset.get_data_local_num_dict() - # train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size) - - train_data_global, test_data_global = get_dataloader_ImageNet_truncated(train_dataset, test_dataset, - train_bs=batch_size, test_bs=batch_size, - dataidxs=None, net_dataidx_map=None, ) + if args.if_timm_dataset: + train_data_global, test_data_global = get_timm_loader(train_dataset, test_dataset, args) + else: + train_data_global, test_data_global = get_dataloader(train_dataset, test_dataset, + train_bs=batch_size, test_bs=batch_size, + dataidxs=None, net_dataidx_map=None, args=None) logging.info("train_dl_global number = " + str(len(train_data_global))) logging.info("test_dl_global number = " + str(len(test_data_global))) @@ -153,11 +336,19 @@ def load_partition_data_ImageNet(dataset, data_dir, for client_idx in range(client_number): if client_number == 1000: + if dataset not in ['ILSVRC2012', 'ILSVRC2012_hdf5']: + raise NotImplementedError("Only support 1000 clients for Full ILSVRC2012!") dataidxs = client_idx data_local_num_dict = class_num_dict elif client_number == 100: - dataidxs = [client_idx * 10 + i for i in 
range(10)] - data_local_num_dict[client_idx] = sum(class_num_dict[client_idx + i] for i in range(10)) + if dataset in ['ILSVRC2012', 'ILSVRC2012_hdf5']: + dataidxs = [client_idx * 10 + i for i in range(10)] + data_local_num_dict[client_idx] = sum(class_num_dict[client_idx + i] for i in range(10)) + elif dataset in ['ILSVRC2012-100']: + dataidxs = client_idx + data_local_num_dict = class_num_dict + else: + raise NotImplementedError else: raise NotImplementedError("Not support other client_number for now!") @@ -168,10 +359,16 @@ def load_partition_data_ImageNet(dataset, data_dir, # training batch size = 64; algorithms batch size = 32 # train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size, # dataidxs) - train_data_local, test_data_local = get_dataloader_ImageNet_truncated(train_dataset, test_dataset, - train_bs=batch_size, test_bs=batch_size, - dataidxs=dataidxs, - net_dataidx_map=net_dataidx_map) + train_dataset_local, test_dataset_local = get_ImageNet_truncated(train_dataset, test_dataset, + train_bs=batch_size, test_bs=batch_size, + dataidxs=dataidxs, + net_dataidx_map=net_dataidx_map, args=args) + if args.if_timm_dataset: + train_data_local, test_data_local = get_timm_loader(train_dataset_local, test_dataset_local, args) + else: + train_data_local, test_data_local = get_dataloader(train_dataset_local, test_dataset_local, + train_bs=batch_size, test_bs=batch_size, + dataidxs=None, net_dataidx_map=None, args=args) # logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( # client_idx, len(train_data_local), len(test_data_local))) @@ -184,7 +381,8 @@ def load_partition_data_ImageNet(dataset, data_dir, if __name__ == '__main__': - data_dir = '/home/datasets/imagenet/ILSVRC2012_dataset' + # data_dir = '/home/datasets/imagenet/ILSVRC2012_dataset' + data_dir = '/home/datasets/imagenet/imagenet_hdf5/imagenet-shuffled.hdf5' client_number = 100 train_data_num, test_data_num, train_data_global, 
test_data_global, \ diff --git a/data_preprocessing/ImageNet/datasets.py b/data_preprocessing/ImageNet/datasets.py index 44d702b..89ce77d 100644 --- a/data_preprocessing/ImageNet/datasets.py +++ b/data_preprocessing/ImageNet/datasets.py @@ -1,8 +1,10 @@ import os import os.path +import logging -import torch.utils.data as data from PIL import Image +import torch.utils.data as data +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset def has_file_allowed_extension(filename, extensions): @@ -25,18 +27,21 @@ def find_classes(dir): return classes, class_to_idx -def make_dataset(dir, class_to_idx, extensions): +def make_dataset(dir, class_to_idx, extensions, num_classes=1000): images = [] data_local_num_dict = dict() net_dataidx_map = dict() sum_temp = 0 dir = os.path.expanduser(dir) + + i_target = 0 for target in sorted(os.listdir(dir)): + if not (i_target < num_classes): + break d = os.path.join(dir, target) if not os.path.isdir(d): continue - target_num = 0 for root, _, fnames in sorted(os.walk(d)): for fname in sorted(fnames): @@ -49,6 +54,7 @@ def make_dataset(dir, class_to_idx, extensions): net_dataidx_map[class_to_idx[target]] = (sum_temp, sum_temp + target_num) data_local_num_dict[class_to_idx[target]] = target_num sum_temp += target_num + i_target += 1 assert len(images) == sum_temp return images, data_local_num_dict, net_dataidx_map @@ -126,7 +132,83 @@ def __getdatasets__(self): if len(all_data) == 0: raise (RuntimeError("Found 0 files in subfolders of: " + self.data_dir + "\n" "Supported extensions are: " + ",".join( - extensions))) + IMG_EXTENSIONS))) + return all_data, data_local_num_dict, net_dataidx_map + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + # img, target = self.data[index], self.target[index] + + path, target = self.local_data[index] + img = self.loader(path) + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.local_data) + + +class ImageNet100(data.Dataset): + + def __init__(self, data_dir, dataidxs=None, train=True, transform=None, target_transform=None, download=False): + """ + Generating this class too many times will be time-consuming. + So it will be better calling this once and put it into ImageNet_truncated. + """ + self.dataidxs = dataidxs + self.train = train + self.transform = transform + self.target_transform = target_transform + self.download = download + self.loader = default_loader + if self.train: + self.data_dir = os.path.join(data_dir, 'train') + else: + self.data_dir = os.path.join(data_dir, 'val') + + self.all_data, self.data_local_num_dict, self.net_dataidx_map = self.__getdatasets__() + if dataidxs == None: + self.local_data = self.all_data + elif type(dataidxs) == int: + (begin, end) = self.net_dataidx_map[dataidxs] + self.local_data = self.all_data[begin: end] + else: + self.local_data = [] + for idxs in dataidxs: + (begin, end) = self.net_dataidx_map[idxs] + self.local_data += self.all_data[begin: end] + + def get_local_data(self): + return self.local_data + + def get_net_dataidx_map(self): + return self.net_dataidx_map + + def get_data_local_num_dict(self): + return self.data_local_num_dict + + def __getdatasets__(self): + # all_data = datasets.ImageFolder(data_dir, self.transform, self.target_transform) + + classes, class_to_idx = find_classes(self.data_dir) + IMG_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif'] + all_data, data_local_num_dict, net_dataidx_map = make_dataset( + self.data_dir, class_to_idx, IMG_EXTENSIONS, num_classes=100) + if len(all_data) == 0: + raise (RuntimeError("Found 
0 files in subfolders of: " + self.data_dir + "\n" + "Supported extensions are: " + ",".join( + IMG_EXTENSIONS))) return all_data, data_local_num_dict, net_dataidx_map def __getitem__(self, index): diff --git a/data_preprocessing/ImageNet/datasets_hdf5.py b/data_preprocessing/ImageNet/datasets_hdf5.py new file mode 100644 index 0000000..35f6fe8 --- /dev/null +++ b/data_preprocessing/ImageNet/datasets_hdf5.py @@ -0,0 +1,191 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import os +import os.path + +import torch.utils.data as data +import torchvision.transforms as transforms +import h5py +import numpy as np + +class DatasetHDF5(data.Dataset): + def __init__(self, hdf5fn, t, transform=None, target_transform=None): + """ + t: 'train' or 'val' + """ + super(DatasetHDF5, self).__init__() + self.hf = h5py.File(hdf5fn, 'r', libver='latest', swmr=True) + self.t = t + self.n_images= self.hf['%s_img'%self.t].shape[0] + self.dlabel = self.hf['%s_labels'%self.t][...] + self.d = self.hf['%s_img'%self.t] + # self.transform = transform + # self.target_transform = target_transform + + def _get_dataset_x_and_target(self, index): + img = self.d[index, ...] + target = self.dlabel[index] + return img, np.int64(target) + + def __getitem__(self, index): + img, target = self._get_dataset_x_and_target(index) + # if self.transform is not None: + # img = self.transform(img) + # if self.target_transform is not None: + # target = self.target_transform(target) + return img, target + + def __len__(self): + return self.n_images + + +class ImageNet_hdf5(data.Dataset): + + def __init__(self, data_dir, dataidxs=None, train=True, transform=None, target_transform=None, download=False): + """ + Generating this class too many times will be time-consuming. + So it will be better calling this once and put it into ImageNet_truncated. 
+ """ + self.dataidxs = dataidxs + self.train = train + self.transform = transform + self.target_transform = target_transform + self.download = download + self.hdf5fn = os.path.join(data_dir) + + # if self.train: + # self.data_dir = os.path.join(data_dir, 'train') + # else: + # self.data_dir = os.path.join(data_dir, 'val') + + self.all_data_hdf5 = DatasetHDF5(self.hdf5fn, 'train' if self.train else 'val', + transform=self.transform, target_transform=self.target_transform) + + self.data_local_num_dict, self.net_dataidx_map = \ + self._get_net_dataidx_map() + + """ + self.local_data_idx is a list containing indexes of local client + """ + self.all_data_idx = range(len(self.all_data_hdf5)) + if dataidxs == None: + self.local_data_idx = self.all_data_idx + elif type(dataidxs) == int: + self.local_data_idx = self.net_dataidx_map[dataidxs] + else: + self.local_data_idx = [] + for idxs in dataidxs: + self.local_data_idx += self.net_dataidx_map[idxs] + + + def _get_net_dataidx_map(self): + data_local_num_dict = dict() + net_dataidx_map = dict() + for i, label in enumerate(self.all_data_hdf5.dlabel): + label_int = np.int64(label) + if label in net_dataidx_map: + net_dataidx_map[label_int].append(i) + else: + net_dataidx_map[label_int] = [] + net_dataidx_map[label_int].append(i) + + for key, value in net_dataidx_map.items(): + data_local_num_dict[key] = len(value) + + return data_local_num_dict, net_dataidx_map + + + def get_net_dataidx_map(self): + return self.net_dataidx_map + + def get_data_local_num_dict(self): + return self.data_local_num_dict + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + + img, target = self.all_data_hdf5[self.local_data_idx[index]] + img = transforms.ToPILImage()(img) + # img = self.loader(path) + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.local_data_idx) + + + +class ImageNet_truncated_hdf5(data.Dataset): + + def __init__(self, imagenet_dataset: ImageNet_hdf5, dataidxs, net_dataidx_map, train=True, transform=None, + target_transform=None, download=False): + + self.dataidxs = dataidxs + self.train = train + # self.transform = transform + # self.target_transform = target_transform + self.download = download + + self.all_data_hdf5 = imagenet_dataset + + self.data_local_num_dict = imagenet_dataset.data_local_num_dict + + self.net_dataidx_map = imagenet_dataset.net_dataidx_map + + """ + self.local_data_idx is a list containing indexes of local client + """ + self.all_data_idx = range(len(self.all_data_hdf5)) + if dataidxs == None: + self.local_data_idx = self.all_data_idx + elif type(dataidxs) == int: + self.local_data_idx = self.net_dataidx_map[dataidxs] + else: + self.local_data_idx = [] + for idxs in dataidxs: + self.local_data_idx += self.net_dataidx_map[idxs] + + + def get_net_dataidx_map(self): + return self.net_dataidx_map + + def get_data_local_num_dict(self): + return self.data_local_num_dict + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + + # Transform operation has been conducted in all_data_hdf5 + img, target = self.all_data_hdf5[self.local_data_idx[index]] + return img, target + + def __len__(self): + return len(self.local_data_idx) + + + + + + + + + + diff --git a/data_preprocessing/Landmarks/data_loader.py b/data_preprocessing/Landmarks/data_loader.py index 377efb7..d1e8ea9 100644 --- a/data_preprocessing/Landmarks/data_loader.py +++ b/data_preprocessing/Landmarks/data_loader.py @@ -9,6 +9,7 @@ import torch import torch.utils.data as data import torchvision.transforms as transforms +from timm.data import Dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset from .datasets import Landmarks @@ -28,47 +29,7 @@ def _read_csv(path: str): with open(path, 'r') as f: return list(csv.DictReader(f)) -# class Cutout(object): -# def __init__(self, length): -# self.length = length -# def __call__(self, img): -# h, w = img.size(1), img.size(2) -# mask = np.ones((h, w), np.float32) -# y = np.random.randint(h) -# x = np.random.randint(w) - -# y1 = np.clip(y - self.length // 2, 0, h) -# y2 = np.clip(y + self.length // 2, 0, h) -# x1 = np.clip(x - self.length // 2, 0, w) -# x2 = np.clip(x + self.length // 2, 0, w) - -# mask[y1: y2, x1: x2] = 0. 
-# mask = torch.from_numpy(mask) -# mask = mask.expand_as(img) -# img *= mask -# return img - -# def _data_transforms_landmarks(): -# landmarks_MEAN = [0.5071, 0.4865, 0.4409] -# landmarks_STD = [0.2673, 0.2564, 0.2762] - -# train_transform = transforms.Compose([ -# transforms.ToPILImage(), -# transforms.RandomCrop(32, padding=4), -# transforms.RandomHorizontalFlip(), -# transforms.ToTensor(), -# transforms.Normalize(landmarks_MEAN, landmarks_STD), -# ]) - -# train_transform.transforms.append(Cutout(16)) - -# valid_transform = transforms.Compose([ -# transforms.ToTensor(), -# transforms.Normalize(landmarks_MEAN, landmarks_STD), -# ]) - -# return train_transform, valid_transform class Cutout(object): def __init__(self, length): @@ -92,12 +53,16 @@ def __call__(self, img): return img -def _data_transforms_landmarks(): - # IMAGENET_MEAN = [0.5071, 0.4865, 0.4409] - # IMAGENET_STD = [0.2673, 0.2564, 0.2762] +def _data_transforms_landmarks(args): - IMAGENET_MEAN = [0.5, 0.5, 0.5] - IMAGENET_STD = [0.5, 0.5, 0.5] + if args.data_transform == 'FLTransform': + IMAGENET_MEAN = [0.5, 0.5, 0.5] + IMAGENET_STD = [0.5, 0.5, 0.5] + elif args.data_transform == 'NormalTransform': + IMAGENET_MEAN = [0.485, 0.456, 0.406] + IMAGENET_STD = [0.229, 0.224, 0.225] + else: + raise NotImplementedError image_size = 224 train_transform = transforms.Compose([ @@ -132,7 +97,7 @@ def get_mapping_per_user(fn): mapping_table = _read_csv(fn) expected_cols = ['user_id', 'image_id', 'class'] if not all(col in mapping_table[0].keys() for col in expected_cols): - logger.error('%s has wrong format.', mapping_file) + logger.error('%s has wrong format.', fn) raise ValueError( 'The mapping file must contain user_id, image_id and class columns. 
' 'The existing columns are %s' % ','.join(mapping_table[0].keys())) @@ -161,46 +126,127 @@ def get_mapping_per_user(fn): return data_files, data_local_num_dict, net_dataidx_map -# for centralized training -def get_dataloader(dataset, datadir, train_files, test_files, train_bs, test_bs, dataidxs=None): - return get_dataloader_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs) - - -# for local devices -def get_dataloader_test(dataset, datadir, train_files, test_files, train_bs, test_bs, dataidxs_train, dataidxs_test): - return get_dataloader_test_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs_train, dataidxs_test) +def get_dataloader(dataset_train, dataset_test, dataidxs=None, args=None): + train_bs = args.batch_size + test_bs = args.batch_size + train_dl = data.DataLoader(dataset=dataset_train, batch_size=train_bs, shuffle=True, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) + test_dl = data.DataLoader(dataset=dataset_test, batch_size=test_bs, shuffle=False, drop_last=False, + pin_memory=True, num_workers=args.data_load_num_workers) -def get_dataloader_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs=None): - dl_obj = Landmarks + return train_dl, test_dl - transform_train, transform_test = _data_transforms_landmarks() - train_ds = dl_obj(datadir, train_files, dataidxs=dataidxs, train=True, transform=transform_train, download=True) - test_ds = dl_obj(datadir, test_files, dataidxs=None, train=False, transform=transform_test, download=True) +# def get_dataloader_Landmarks(datadir, train_files, test_files, train_bs, test_bs, dataidxs=None): +# dl_obj = Landmarks - train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=False) - test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=False) +# transform_train, transform_test = _data_transforms_landmarks() - return train_dl, test_dl +# train_ds = 
def get_timm_loader(dataset_train, dataset_test, args):
    """Build timm train/eval DataLoaders for the given datasets.

    Delegates to timm's ``create_loader`` so that all of timm's data
    transforms / augmentations (random erasing, color jitter, auto-augment,
    AugMix batch splits, ...) are applied.

    NOTE(review): this function mutates ``args`` (sets ``prefetcher``,
    ``pin_mem`` and ``use_multi_epochs_loader``) because those switches are
    not part of the CLI arguments; callers observe the mutated values.

    :param dataset_train: training dataset (timm-compatible / self-defined)
    :param dataset_test: test dataset
    :param args: parsed experiment arguments (timm-style fields expected)
    :return: tuple ``(loader_train, loader_eval)``
    :raises ValueError: if ``args.aug_splits == 1`` (a split of 1 makes no sense)
    :raises NotImplementedError: for an unknown ``args.data_transform``
    """
    logging.info("Using timm dataset and dataloader")

    # Resolve input size / interpolation / mean / std from args; only rank 0
    # logs the resolved config.
    # TODO(review): confirm resolve_data_config behaves as intended with model=None.
    data_config = resolve_data_config(vars(args), model=None, verbose=args.rank == 0)

    # Setup augmentation batch splits for contrastive loss or split bn.
    num_aug_splits = 0
    if args.aug_splits > 0:
        if args.aug_splits == 1:
            # Raise instead of assert: asserts are stripped under `python -O`.
            raise ValueError('A split of 1 makes no sense')
        num_aug_splits = args.aug_splits

    # Wrap dataset in the AugMix helper when batch splits are requested.
    if num_aug_splits > 1:
        dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)

    # Create data loaders w/ augmentation pipeline.
    train_interpolation = args.train_interpolation
    if args.no_aug or not train_interpolation:
        train_interpolation = data_config['interpolation']

    # Defaults for switches that are not exposed on the CLI.
    args.prefetcher = False
    args.pin_mem = False
    collate_fn = None
    args.use_multi_epochs_loader = False

    train_batch_size = args.batch_size
    # Evaluation uses a quarter of the training batch size.
    test_batch_size = args.batch_size // 4

    if args.data_transform == 'FLTransform':
        data_config['mean'] = [0.5, 0.5, 0.5]
        data_config['std'] = [0.5, 0.5, 0.5]
    elif args.data_transform == 'NormalTransform':
        # Keep the mean/std resolved by timm.
        pass
    else:
        raise NotImplementedError

    logging.info("data transform, MEAN: {}, STD: {}.".format(
        data_config['mean'], data_config['std']))

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=train_batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        no_aug=args.no_aug,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        re_split=args.resplit,
        scale=args.scale,
        ratio=args.ratio,
        hflip=args.hflip,
        vflip=args.vflip,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_aug_splits=num_aug_splits,
        interpolation=train_interpolation,
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.data_load_num_workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        pin_memory=args.pin_mem,
        use_multi_epochs_loader=args.use_multi_epochs_loader
    )

    loader_eval = create_loader(
        dataset_test,
        input_size=data_config['input_size'],
        batch_size=test_batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.data_load_num_workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
        pin_memory=args.pin_mem,
    )
    return loader_train, loader_eval
train_data_global, test_data_global = get_dataloader(train_dataset, test_dataset, args) + # logging.info("train_dl_global number = " + str(len(train_data_global))) # logging.info("test_dl_global number = " + str(len(test_data_global))) test_data_num = len(test_files) @@ -227,9 +284,13 @@ def load_partition_data_landmarks(dataset, data_dir, fed_train_map_file, fed_tes # data_local_num_dict[client_idx] = local_data_num # logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num)) - # training batch size = 64; algorithms batch size = 32 - train_data_local, test_data_local = get_dataloader(dataset, data_dir, train_files, test_files, batch_size, batch_size, - dataidxs) + train_dataset_local = Landmarks(data_dir, train_files, dataidxs=dataidxs, train=True, transform=transform_train, download=True) + test_dataset_local = Landmarks(data_dir, test_files, dataidxs=None, train=False, transform=transform_test, download=True) + if args.if_timm_dataset: + train_data_local, test_data_local = get_timm_loader(train_dataset_local, test_dataset_local, args) + else: + train_data_local, test_data_local = get_dataloader(train_dataset_local, test_dataset_local, args) + # logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( # client_idx, len(train_data_local), len(test_data_local))) train_data_local_dict[client_idx] = train_data_local diff --git a/data_preprocessing/cifar10/__init__.py b/data_preprocessing/cifar10/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_preprocessing/cifar10/data_loader.py b/data_preprocessing/cifar10/data_loader.py new file mode 100644 index 0000000..3377f9d --- /dev/null +++ b/data_preprocessing/cifar10/data_loader.py @@ -0,0 +1,269 @@ +import logging + +import numpy as np +import torch +import torch.utils.data as data +import torchvision.transforms as transforms + +from .datasets import CIFAR10_truncated + +logging.basicConfig() +logger = logging.getLogger() 
logging.getLogger().setLevel(logging.INFO)


def read_data_distribution(filename='./data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'):
    """Read a pre-computed {client_id: {class_id: sample_count}} dump.

    The file is a pretty-printed nested dict, one entry per line; purely
    structural lines (starting with '{' or '}') are skipped.
    """
    distribution = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0]:
                tmp = line.split(':')
                if '{' == tmp[1].strip():
                    first_level_key = int(tmp[0])
                    distribution[first_level_key] = {}
                else:
                    second_level_key = int(tmp[0])
                    distribution[first_level_key][second_level_key] = int(tmp[1].strip().replace(',', ''))
    return distribution


def read_net_dataidx_map(filename='./data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'):
    """Read a pre-computed {client_id: [sample_idx, ...]} dump."""
    net_dataidx_map = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0] and ']' != line[0]:
                tmp = line.split(':')
                if '[' == tmp[-1].strip():
                    key = int(tmp[0])
                    net_dataidx_map[key] = []
                else:
                    tmp_array = line.split(',')
                    net_dataidx_map[key] = [int(i.strip()) for i in tmp_array]
    return net_dataidx_map


def record_net_data_stats(y_train, net_dataidx_map):
    """Return per-client class histograms: {client: {class: count}}."""
    net_cls_counts = {}
    for net_i, dataidx in net_dataidx_map.items():
        unq, unq_cnt = np.unique(y_train[dataidx], return_counts=True)
        net_cls_counts[net_i] = {unq[i]: unq_cnt[i] for i in range(len(unq))}
    logging.debug('Data statistics: %s', str(net_cls_counts))
    return net_cls_counts


class Cutout(object):
    """Zero out one randomly-placed square patch of a CHW image tensor (in place)."""

    def __init__(self, length):
        # length: side of the square patch, in pixels (clipped at image borders)
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)
        # Patch center is uniform over the image; the patch itself is clipped.
        y = np.random.randint(h)
        x = np.random.randint(w)

        y1 = np.clip(y - self.length // 2, 0, h)
        y2 = np.clip(y + self.length // 2, 0, h)
        x1 = np.clip(x - self.length // 2, 0, w)
        x2 = np.clip(x + self.length // 2, 0, w)

        mask[y1: y2, x1: x2] = 0.
        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)
        img *= mask
        return img


def _data_transforms_cifar10():
    """Return (train_transform, test_transform) for CIFAR-10.

    Train: random crop w/ padding, horizontal flip, normalize, Cutout(16).
    Test: normalize only.
    """
    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    train_transform.transforms.append(Cutout(16))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    return train_transform, valid_transform


def load_cifar10_data(datadir):
    """Load full CIFAR-10 as arrays: (X_train, y_train, X_test, y_test)."""
    train_transform, test_transform = _data_transforms_cifar10()

    cifar10_train_ds = CIFAR10_truncated(datadir, train=True, download=True, transform=train_transform)
    cifar10_test_ds = CIFAR10_truncated(datadir, train=False, download=True, transform=test_transform)

    X_train, y_train = cifar10_train_ds.data, cifar10_train_ds.target
    X_test, y_test = cifar10_test_ds.data, cifar10_test_ds.target

    return (X_train, y_train, X_test, y_test)


def partition_data(dataset, datadir, partition, n_nets, alpha):
    """Partition CIFAR-10 across ``n_nets`` clients.

    Supported ``partition`` values:
      - "homo": random equal-size IID split
      - "hetero": Dirichlet(alpha) label-skew split; resampled until every
        client holds at least 10 samples
      - "hetero-fix": load a pre-computed split from disk

    :return: (X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts)
    :raises ValueError: on an unknown partition method (previously this fell
        through to a confusing NameError on ``net_dataidx_map``)
    """
    logging.info("*********partition data***************")
    X_train, y_train, X_test, y_test = load_cifar10_data(datadir)
    n_train = X_train.shape[0]

    if partition == "homo":
        idxs = np.random.permutation(n_train)
        batch_idxs = np.array_split(idxs, n_nets)
        net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)}

    elif partition == "hetero":
        min_size = 0
        K = 10  # number of CIFAR-10 classes
        N = y_train.shape[0]
        logging.info("N = " + str(N))
        net_dataidx_map = {}

        # Resample until every client holds at least 10 samples.
        while min_size < 10:
            idx_batch = [[] for _ in range(n_nets)]
            for k in range(K):
                idx_k = np.where(y_train == k)[0]
                np.random.shuffle(idx_k)
                proportions = np.random.dirichlet(np.repeat(alpha, n_nets))
                # Balance: zero out clients already holding >= N / n_nets samples.
                proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)])
                proportions = proportions / proportions.sum()
                proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1]
                idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))]
            min_size = min([len(idx_j) for idx_j in idx_batch])

        for j in range(n_nets):
            np.random.shuffle(idx_batch[j])
            net_dataidx_map[j] = idx_batch[j]

    elif partition == "hetero-fix":
        dataidx_map_file_path = './data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'
        net_dataidx_map = read_net_dataidx_map(dataidx_map_file_path)

    else:
        raise ValueError("unknown partition method: %s" % partition)

    if partition == "hetero-fix":
        distribution_file_path = './data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'
        traindata_cls_counts = read_data_distribution(distribution_file_path)
    else:
        traindata_cls_counts = record_net_data_stats(y_train, net_dataidx_map)

    return X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts


# for centralized training
def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_CIFAR10(datadir, train_bs, test_bs, dataidxs)


# for local devices
def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_test_CIFAR10(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test)


def get_dataloader_CIFAR10(datadir, train_bs, test_bs, dataidxs=None):
    """Train/test DataLoaders; the train set is truncated to ``dataidxs`` if given."""
    dl_obj = CIFAR10_truncated

    transform_train, transform_test = _data_transforms_cifar10()

    train_ds = dl_obj(datadir, dataidxs=dataidxs, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, train=False, transform=transform_test, download=True)

    # drop_last=True keeps batch shapes constant (e.g. for BatchNorm).
    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def get_dataloader_test_CIFAR10(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None):
    """Train/test DataLoaders with independent truncation index lists."""
    dl_obj = CIFAR10_truncated

    transform_train, transform_test = _data_transforms_cifar10()

    train_ds = dl_obj(datadir, dataidxs=dataidxs_train, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, dataidxs=dataidxs_test, train=False, transform=transform_test, download=True)

    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def load_partition_data_distributed_cifar10(process_id, dataset, data_dir, partition_method, partition_alpha,
                                            client_number, batch_size):
    """Partitioned CIFAR-10 for the distributed setting.

    Process 0 (the server) gets the global loaders; every other process gets
    only its own client shard (client index = process_id - 1).
    """
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    if process_id == 0:
        # Server: global loaders only.
        train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
        logging.info("train_dl_global number = " + str(len(train_data_global)))
        logging.info("test_dl_global number = " + str(len(test_data_global)))
        train_data_local = None
        test_data_local = None
        local_data_num = 0
    else:
        # Client: local shard only.
        dataidxs = net_dataidx_map[process_id - 1]
        local_data_num = len(dataidxs)
        logging.info("rank = %d, local_sample_number = %d" % (process_id, local_data_num))
        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("process_id = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            process_id, len(train_data_local), len(test_data_local)))
        train_data_global = None
        test_data_global = None
    return train_data_num, train_data_global, test_data_global, local_data_num, train_data_local, test_data_local, class_num


def load_partition_data_cifar10(dataset, data_dir, partition_method, partition_alpha, client_number, batch_size):
    """Partitioned CIFAR-10 for the single-process (standalone) setting.

    Builds the global loaders plus one (train, test) loader pair per client.
    Note the local *test* loader is the full test set for every client.
    """
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
    logging.info("train_dl_global number = " + str(len(train_data_global)))
    logging.info("test_dl_global number = " + str(len(test_data_global)))
    test_data_num = len(test_data_global)

    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(client_number):
        dataidxs = net_dataidx_map[client_idx]
        local_data_num = len(dataidxs)
        data_local_num_dict[client_idx] = local_data_num
        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))

        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local
    return train_data_num, test_data_num, train_data_global, test_data_global, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num
# ===========================================================================
# data_preprocessing/cifar10/datasets.py
# ===========================================================================
import logging

import numpy as np
import torch.utils.data as data
from PIL import Image
from torchvision.datasets import CIFAR10

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')


def accimage_loader(path):
    """Load an image via accimage, falling back to PIL on decode errors."""
    import accimage
    try:
        return accimage.Image(path)
    except IOError:
        # Potentially a decoding problem, fall back to PIL.Image
        return pil_loader(path)


def pil_loader(path):
    """Load an RGB PIL image.

    Opens via a file object to avoid a ResourceWarning
    (https://github.com/python-pillow/Pillow/issues/835).
    """
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


def default_loader(path):
    """Dispatch to accimage or PIL according to the active torchvision backend."""
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader(path)
    else:
        return pil_loader(path)


class CIFAR10_truncated(data.Dataset):
    """CIFAR-10 restricted to the sample indices in ``dataidxs`` (all samples if None)."""

    def __init__(self, root, dataidxs=None, train=True, transform=None, target_transform=None, download=False):
        self.root = root
        self.dataidxs = dataidxs
        self.train = train
        self.transform = transform
        self.target_transform = target_transform
        self.download = download

        self.data, self.target = self.__build_truncated_dataset__()

    def __build_truncated_dataset__(self):
        """Materialize (data, target) arrays, truncated to ``self.dataidxs``."""
        # logger.debug instead of the previous bare print(): avoid stdout noise.
        logger.debug("download = " + str(self.download))
        cifar_dataobj = CIFAR10(self.root, self.train, self.transform, self.target_transform, self.download)

        # torchvision exposes identical .data/.targets for both splits, so no
        # train/test branching is needed here.
        data = cifar_dataobj.data
        target = np.array(cifar_dataobj.targets)

        if self.dataidxs is not None:
            data = data[self.dataidxs]
            target = target[self.dataidxs]

        return data, target

    def truncate_channel(self, index):
        """Zero the G and B channels of the samples selected by ``index`` (in place)."""
        for i in range(index.shape[0]):
            gs_index = index[i]
            self.data[gs_index, :, :, 1] = 0.0
            self.data[gs_index, :, :, 2] = 0.0

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        img, target = self.data[index], self.target[index]

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        return len(self.data)


# ===========================================================================
# data_preprocessing/cifar10/iid_data_loader.py
# ===========================================================================
import os
import argparse
import time
import math
import logging

import torch
import torch.utils.data as data
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data.distributed import DistributedSampler


def load_iid_cifar10(dataset, data_dir, partition_method,
                     partition_alpha, client_number, batch_size, rank=0):
    """IID CIFAR-10 loaders where sharding is done by a DistributedSampler.

    All clients share the same DataLoader objects; when ``client_number > 1``
    the sampler restricts each process to its own shard.  ``dataset``,
    ``partition_method`` and ``partition_alpha`` are accepted only for
    interface parity with the non-IID loaders and are unused.

    :return: (train_data_num, test_data_num, train_dl, test_dl,
              data_local_num_dict, train_data_local_dict,
              test_data_local_dict, class_num)
    """
    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    image_size = 32
    train_transform = transforms.Compose([
        transforms.RandomCrop(image_size, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=CIFAR_MEAN, std=CIFAR_STD),
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=CIFAR_MEAN, std=CIFAR_STD),
    ])

    train_dataset = CIFAR10(root=data_dir, train=True,
                            transform=train_transform, download=False)
    test_dataset = CIFAR10(root=data_dir, train=False,
                           transform=test_transform, download=False)

    train_sampler = None
    shuffle = True
    if client_number > 1:
        # The sampler shards and shuffles, so the DataLoader itself must not.
        train_sampler = DistributedSampler(
            train_dataset, num_replicas=client_number, rank=rank)
        train_sampler.set_epoch(0)
        shuffle = False

    train_dl = data.DataLoader(train_dataset, batch_size=batch_size,
                               shuffle=shuffle, num_workers=4, sampler=train_sampler)
    test_dl = data.DataLoader(test_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)

    class_num = 10

    train_data_num = len(train_dataset)
    test_data_num = len(test_dataset)

    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(client_number):
        # Every client reuses the shared loaders; the sampler does the sharding.
        local_data_num = train_data_num // client_number
        train_data_local_dict[client_idx] = train_dl
        test_data_local_dict[client_idx] = test_dl
        data_local_num_dict[client_idx] = local_data_num
        # Fix: previously logged the *global* sample count as the local one.
        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))

    return train_data_num, test_data_num, train_dl, test_dl, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num


# ===========================================================================
# data_preprocessing/cifar100/data_loader.py
# ===========================================================================
import logging

import numpy as np
import torch
import torch.utils.data as data
import torchvision.transforms as transforms

from .datasets import CIFAR100_truncated

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def read_data_distribution(filename='./data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'):
    """Read a pre-computed {client_id: {class_id: sample_count}} dump."""
    distribution = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0]:
                tmp = line.split(':')
                if '{' == tmp[1].strip():
                    first_level_key = int(tmp[0])
                    distribution[first_level_key] = {}
                else:
                    second_level_key = int(tmp[0])
                    distribution[first_level_key][second_level_key] = int(tmp[1].strip().replace(',', ''))
    return distribution


def read_net_dataidx_map(filename='./data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'):
    """Read a pre-computed {client_id: [sample_idx, ...]} dump."""
    net_dataidx_map = {}
    with open(filename, 'r') as fin:
        for line in fin.readlines():
            if '{' != line[0] and '}' != line[0] and ']' != line[0]:
                tmp = line.split(':')
                if '[' == tmp[-1].strip():
                    key = int(tmp[0])
                    net_dataidx_map[key] = []
                else:
                    tmp_array = line.split(',')
                    net_dataidx_map[key] = [int(i.strip()) for i in tmp_array]
    return net_dataidx_map


def record_net_data_stats(y_train, net_dataidx_map):
    """Return per-client class histograms: {client: {class: count}}."""
    net_cls_counts = {}
    for net_i, dataidx in net_dataidx_map.items():
        unq, unq_cnt = np.unique(y_train[dataidx], return_counts=True)
        net_cls_counts[net_i] = {unq[i]: unq_cnt[i] for i in range(len(unq))}
    logging.debug('Data statistics: %s', str(net_cls_counts))
    return net_cls_counts


class Cutout(object):
    """Zero out one randomly-placed square patch of a CHW image tensor (in place)."""

    def __init__(self, length):
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)
        y = np.random.randint(h)
        x = np.random.randint(w)

        y1 = np.clip(y - self.length // 2, 0, h)
        y2 = np.clip(y + self.length // 2, 0, h)
        x1 = np.clip(x - self.length // 2, 0, w)
        x2 = np.clip(x + self.length // 2, 0, w)

        mask[y1: y2, x1: x2] = 0.
        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)
        img *= mask
        return img


def _data_transforms_cifar100():
    """Return (train_transform, test_transform) for CIFAR-100."""
    CIFAR_MEAN = [0.5071, 0.4865, 0.4409]
    CIFAR_STD = [0.2673, 0.2564, 0.2762]

    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    train_transform.transforms.append(Cutout(16))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    return train_transform, valid_transform


def load_cifar100_data(datadir):
    """Load full CIFAR-100 as arrays: (X_train, y_train, X_test, y_test)."""
    train_transform, test_transform = _data_transforms_cifar100()

    cifar100_train_ds = CIFAR100_truncated(datadir, train=True, download=True, transform=train_transform)
    cifar100_test_ds = CIFAR100_truncated(datadir, train=False, download=True, transform=test_transform)

    X_train, y_train = cifar100_train_ds.data, cifar100_train_ds.target
    X_test, y_test = cifar100_test_ds.data, cifar100_test_ds.target

    return (X_train, y_train, X_test, y_test)


def partition_data(dataset, datadir, partition, n_nets, alpha):
    """Partition CIFAR-100 across ``n_nets`` clients ("homo" | "hetero" | "hetero-fix").

    :raises ValueError: on an unknown partition method (previously this fell
        through to a confusing NameError on ``net_dataidx_map``)
    """
    logging.info("*********partition data***************")
    X_train, y_train, X_test, y_test = load_cifar100_data(datadir)
    n_train = X_train.shape[0]

    if partition == "homo":
        idxs = np.random.permutation(n_train)
        batch_idxs = np.array_split(idxs, n_nets)
        net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)}

    elif partition == "hetero":
        min_size = 0
        K = 100  # number of CIFAR-100 classes
        N = y_train.shape[0]
        logging.info("N = " + str(N))
        net_dataidx_map = {}

        # Resample until every client holds at least 10 samples.
        while min_size < 10:
            idx_batch = [[] for _ in range(n_nets)]
            for k in range(K):
                idx_k = np.where(y_train == k)[0]
                np.random.shuffle(idx_k)
                proportions = np.random.dirichlet(np.repeat(alpha, n_nets))
                # Balance: zero out clients already holding >= N / n_nets samples.
                proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)])
                proportions = proportions / proportions.sum()
                proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1]
                idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))]
            min_size = min([len(idx_j) for idx_j in idx_batch])

        for j in range(n_nets):
            np.random.shuffle(idx_batch[j])
            net_dataidx_map[j] = idx_batch[j]

    elif partition == "hetero-fix":
        dataidx_map_file_path = './data_preprocessing/non-iid-distribution/CIFAR100/net_dataidx_map.txt'
        net_dataidx_map = read_net_dataidx_map(dataidx_map_file_path)

    else:
        raise ValueError("unknown partition method: %s" % partition)

    if partition == "hetero-fix":
        distribution_file_path = './data_preprocessing/non-iid-distribution/CIFAR100/distribution.txt'
        traindata_cls_counts = read_data_distribution(distribution_file_path)
    else:
        traindata_cls_counts = record_net_data_stats(y_train, net_dataidx_map)

    return X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts


# for centralized training
def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_CIFAR100(datadir, train_bs, test_bs, dataidxs)


# for local devices
def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test):
    """Thin dispatch kept for interface parity with the other dataset loaders."""
    return get_dataloader_test_CIFAR100(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test)


def get_dataloader_CIFAR100(datadir, train_bs, test_bs, dataidxs=None):
    """Train/test DataLoaders; the train set is truncated to ``dataidxs`` if given."""
    dl_obj = CIFAR100_truncated

    transform_train, transform_test = _data_transforms_cifar100()

    train_ds = dl_obj(datadir, dataidxs=dataidxs, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, train=False, transform=transform_test, download=True)

    # drop_last=True keeps batch shapes constant (e.g. for BatchNorm).
    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def get_dataloader_test_CIFAR100(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None):
    """Train/test DataLoaders with independent truncation index lists."""
    dl_obj = CIFAR100_truncated

    transform_train, transform_test = _data_transforms_cifar100()

    train_ds = dl_obj(datadir, dataidxs=dataidxs_train, train=True, transform=transform_train, download=True)
    test_ds = dl_obj(datadir, dataidxs=dataidxs_test, train=False, transform=transform_test, download=True)

    train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True)
    test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True)

    return train_dl, test_dl


def load_partition_data_distributed_cifar100(process_id, dataset, data_dir, partition_method, partition_alpha,
                                             client_number, batch_size):
    """Partitioned CIFAR-100 for the distributed setting (server = process 0)."""
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    if process_id == 0:
        # Server: global loaders only.
        train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
        logging.info("train_dl_global number = " + str(len(train_data_global)))
        # Fix: previously logged the train loader length here.
        logging.info("test_dl_global number = " + str(len(test_data_global)))
        train_data_local = None
        test_data_local = None
        local_data_num = 0
    else:
        # Client: local shard only.
        dataidxs = net_dataidx_map[process_id - 1]
        local_data_num = len(dataidxs)
        logging.info("rank = %d, local_sample_number = %d" % (process_id, local_data_num))
        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("process_id = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            process_id, len(train_data_local), len(test_data_local)))
        train_data_global = None
        test_data_global = None

    return train_data_num, train_data_global, test_data_global, local_data_num, train_data_local, test_data_local, class_num


def load_partition_data_cifar100(dataset, data_dir, partition_method, partition_alpha, client_number, batch_size):
    """Partitioned CIFAR-100 for the single-process (standalone) setting."""
    X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset,
                                                                                             data_dir,
                                                                                             partition_method,
                                                                                             client_number,
                                                                                             partition_alpha)
    class_num = len(np.unique(y_train))
    logging.info("traindata_cls_counts = " + str(traindata_cls_counts))
    train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)])

    train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size)
    logging.info("train_dl_global number = " + str(len(train_data_global)))
    # Fix: previously logged the train loader length here.
    logging.info("test_dl_global number = " + str(len(test_data_global)))
    test_data_num = len(test_data_global)

    data_local_num_dict = dict()
    train_data_local_dict = dict()
    test_data_local_dict = dict()

    for client_idx in range(client_number):
        dataidxs = net_dataidx_map[client_idx]
        local_data_num = len(dataidxs)
        data_local_num_dict[client_idx] = local_data_num
        logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num))

        train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size,
                                                           dataidxs)
        logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % (
            client_idx, len(train_data_local), len(test_data_local)))
        train_data_local_dict[client_idx] = train_data_local
        test_data_local_dict[client_idx] = test_data_local
    return train_data_num, test_data_num, train_data_global, test_data_global, \
        data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num
b/data_preprocessing/cifar100/datasets.py new file mode 100644 index 0000000..f94460c --- /dev/null +++ b/data_preprocessing/cifar100/datasets.py @@ -0,0 +1,96 @@ +import logging + +import numpy as np +import torch.utils.data as data +from PIL import Image +from torchvision.datasets import CIFAR100 + +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp') + + +def accimage_loader(path): + import accimage + try: + return accimage.Image(path) + except IOError: + # Potentially a decoding problem, fall back to PIL.Image + return pil_loader(path) + + +def pil_loader(path): + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + +def default_loader(path): + from torchvision import get_image_backend + if get_image_backend() == 'accimage': + return accimage_loader(path) + else: + return pil_loader(path) + + +class CIFAR100_truncated(data.Dataset): + + def __init__(self, root, dataidxs=None, train=True, transform=None, target_transform=None, download=False): + + self.root = root + self.dataidxs = dataidxs + self.train = train + self.transform = transform + self.target_transform = target_transform + self.download = download + + self.data, self.target = self.__build_truncated_dataset__() + + def __build_truncated_dataset__(self): + + cifar_dataobj = CIFAR100(self.root, self.train, self.transform, self.target_transform, self.download) + + if self.train: + # print("train member of the class: {}".format(self.train)) + # data = cifar_dataobj.train_data + data = cifar_dataobj.data + target = np.array(cifar_dataobj.targets) + else: + data = cifar_dataobj.data + target = np.array(cifar_dataobj.targets) + + if self.dataidxs is not None: + data = data[self.dataidxs] + target = target[self.dataidxs] + + return data, target + + def 
truncate_channel(self, index): + for i in range(index.shape[0]): + gs_index = index[i] + self.data[gs_index, :, :, 1] = 0.0 + self.data[gs_index, :, :, 2] = 0.0 + + def __getitem__(self, index): + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. + """ + img, target = self.data[index], self.target[index] + + if self.transform is not None: + img = self.transform(img) + + if self.target_transform is not None: + target = self.target_transform(target) + + return img, target + + def __len__(self): + return len(self.data) \ No newline at end of file diff --git a/data_preprocessing/cinic10/__init__.py b/data_preprocessing/cinic10/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_preprocessing/cinic10/data_loader.py b/data_preprocessing/cinic10/data_loader.py new file mode 100644 index 0000000..d499d2d --- /dev/null +++ b/data_preprocessing/cinic10/data_loader.py @@ -0,0 +1,321 @@ +import logging +import os + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.data as data +import torchvision.transforms as transforms + +from .datasets import ImageFolderTruncated + +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) + + +# generate the non-IID distribution for all methods +def read_data_distribution(filename='./data_preprocessing/non-iid-distribution/CIFAR10/distribution.txt'): + distribution = {} + with open(filename, 'r') as data: + for x in data.readlines(): + if '{' != x[0] and '}' != x[0]: + tmp = x.split(':') + if '{' == tmp[1].strip(): + first_level_key = int(tmp[0]) + distribution[first_level_key] = {} + else: + second_level_key = int(tmp[0]) + distribution[first_level_key][second_level_key] = int(tmp[1].strip().replace(',', '')) + return distribution + + +def read_net_dataidx_map(filename='./data_preprocessing/non-iid-distribution/CIFAR10/net_dataidx_map.txt'): + net_dataidx_map = {} + with open(filename, 'r') as 
data: + for x in data.readlines(): + if '{' != x[0] and '}' != x[0] and ']' != x[0]: + tmp = x.split(':') + if '[' == tmp[-1].strip(): + key = int(tmp[0]) + net_dataidx_map[key] = [] + else: + tmp_array = x.split(',') + net_dataidx_map[key] = [int(i.strip()) for i in tmp_array] + return net_dataidx_map + + +def record_net_data_stats(y_train, net_dataidx_map): + net_cls_counts = {} + + for net_i, dataidx in net_dataidx_map.items(): + unq, unq_cnt = np.unique(y_train[dataidx], return_counts=True) + tmp = {unq[i]: unq_cnt[i] for i in range(len(unq))} + net_cls_counts[net_i] = tmp + logging.debug('Data statistics: %s' % str(net_cls_counts)) + return net_cls_counts + + +class Cutout(object): + def __init__(self, length): + self.length = length + + def __call__(self, img): + h, w = img.size(1), img.size(2) + mask = np.ones((h, w), np.float32) + y = np.random.randint(h) + x = np.random.randint(w) + + y1 = np.clip(y - self.length // 2, 0, h) + y2 = np.clip(y + self.length // 2, 0, h) + x1 = np.clip(x - self.length // 2, 0, w) + x2 = np.clip(x + self.length // 2, 0, w) + + mask[y1: y2, x1: x2] = 0. 
+ mask = torch.from_numpy(mask) + mask = mask.expand_as(img) + img *= mask + return img + + +def _data_transforms_cinic10(): + cinic_mean = [0.47889522, 0.47227842, 0.43047404] + cinic_std = [0.24205776, 0.23828046, 0.25874835] + # Transformer for train set: random crops and horizontal flip + train_transform = transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ]) + + # Transformer for test set + valid_transform = transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ]) + return train_transform, valid_transform + + +def load_cinic10_data(datadir): + _train_dir = datadir + str('/train') + logging.info("_train_dir = " + str(_train_dir)) + _test_dir = datadir + str('/test') + cinic_mean = [0.47889522, 0.47227842, 0.43047404] + cinic_std = [0.24205776, 0.23828046, 0.25874835] + trainset = ImageFolderTruncated(_train_dir, transform=transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ])) + + testset = ImageFolderTruncated(_test_dir, transform=transforms.Compose([transforms.ToTensor(), + transforms.Lambda( + lambda x: F.pad(x.unsqueeze(0), + (4, 4, 4, 4), + mode='reflect').data.squeeze()), + transforms.ToPILImage(), + transforms.RandomCrop(32), + 
transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize(mean=cinic_mean, + std=cinic_std), + ])) + X_train, y_train = trainset.imgs, trainset.targets + X_test, y_test = testset.imgs, testset.targets + return (X_train, y_train, X_test, y_test) + + +def partition_data(dataset, datadir, partition, n_nets, alpha): + logging.info("*********partition data***************") + pil_logger = logging.getLogger('PIL') + pil_logger.setLevel(logging.INFO) + + X_train, y_train, X_test, y_test = load_cinic10_data(datadir) + X_train = np.array(X_train) + X_test = np.array(X_test) + y_train = np.array(y_train) + y_test = np.array(y_test) + n_train = len(X_train) + # n_test = len(X_test) + + if partition == "homo": + total_num = n_train + idxs = np.random.permutation(total_num) + batch_idxs = np.array_split(idxs, n_nets) + net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)} + + elif partition == "hetero": + min_size = 0 + K = 10 + N = y_train.shape[0] + logging.info("N = " + str(N)) + net_dataidx_map = {} + + while min_size < 10: + idx_batch = [[] for _ in range(n_nets)] + # for each class in the dataset + for k in range(K): + idx_k = np.where(y_train == k)[0] + np.random.shuffle(idx_k) + proportions = np.random.dirichlet(np.repeat(alpha, n_nets)) + ## Balance + proportions = np.array([p * (len(idx_j) < N / n_nets) for p, idx_j in zip(proportions, idx_batch)]) + proportions = proportions / proportions.sum() + proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1] + idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, np.split(idx_k, proportions))] + min_size = min([len(idx_j) for idx_j in idx_batch]) + + for j in range(n_nets): + np.random.shuffle(idx_batch[j]) + net_dataidx_map[j] = idx_batch[j] + + elif partition == "hetero-fix": + dataidx_map_file_path = './data_preprocessing/non-iid-distribution/CINIC10/net_dataidx_map.txt' + net_dataidx_map = read_net_dataidx_map(dataidx_map_file_path) + + if partition == 
"hetero-fix": + distribution_file_path = './data_preprocessing/non-iid-distribution/CINIC10/distribution.txt' + traindata_cls_counts = read_data_distribution(distribution_file_path) + else: + traindata_cls_counts = record_net_data_stats(y_train, net_dataidx_map) + + return X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts + + +# for centralized training +def get_dataloader(dataset, datadir, train_bs, test_bs, dataidxs=None): + return get_dataloader_cinic10(datadir, train_bs, test_bs, dataidxs) + + +# for local devices +def get_dataloader_test(dataset, datadir, train_bs, test_bs, dataidxs_train, dataidxs_test): + return get_dataloader_test_cinic10(datadir, train_bs, test_bs, dataidxs_train, dataidxs_test) + + +def get_dataloader_cinic10(datadir, train_bs, test_bs, dataidxs=None): + dl_obj = ImageFolderTruncated + + transform_train, transform_test = _data_transforms_cinic10() + + traindir = os.path.join(datadir, 'train') + valdir = os.path.join(datadir, 'test') + + train_ds = dl_obj(traindir, dataidxs=dataidxs, transform=transform_train) + test_ds = dl_obj(valdir, transform=transform_train) + + train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True) + test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True) + + return train_dl, test_dl + +def get_dataloader_test_cinic10(datadir, train_bs, test_bs, dataidxs_train=None, dataidxs_test=None): + dl_obj = ImageFolderTruncated + + transform_train, transform_test = _data_transforms_cinic10() + + traindir = os.path.join(datadir, 'train') + valdir = os.path.join(datadir, 'test') + + train_ds = dl_obj(traindir, dataidxs=dataidxs_train, transform=transform_train) + test_ds = dl_obj(valdir, dataidxs=dataidxs_test, transform=transform_test) + + train_dl = data.DataLoader(dataset=train_ds, batch_size=train_bs, shuffle=True, drop_last=True) + test_dl = data.DataLoader(dataset=test_ds, batch_size=test_bs, shuffle=False, drop_last=True) 
+ + return train_dl, test_dl + + +def load_partition_data_distributed_cinic10(process_id, dataset, data_dir, partition_method, partition_alpha, + client_number, batch_size): + X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset, + data_dir, + partition_method, + client_number, + partition_alpha) + class_num = len(np.unique(y_train)) + logging.info("traindata_cls_counts = " + str(traindata_cls_counts)) + train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) + + # get global test data + if process_id == 0: + train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size) + logging.info("train_dl_global number = " + str(len(train_data_global))) + logging.info("test_dl_global number = " + str(len(train_data_global))) + test_data_num = len(test_data_global) + train_data_local = None + test_data_local = None + local_data_num = 0 + else: + # get local dataset + dataidxs = net_dataidx_map[process_id - 1] + local_data_num = len(dataidxs) + logging.info("rank = %d, local_sample_number = %d" % (process_id, local_data_num)) + # training batch size = 64; algorithms batch size = 32 + train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size, + dataidxs) + logging.info("process_id = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( + process_id, len(train_data_local), len(test_data_local))) + test_data_num = 0 + train_data_global = None + test_data_global = None + + return train_data_num, test_data_num, train_data_global, test_data_global, local_data_num, train_data_local, test_data_local, class_num + + +def load_partition_data_cinic10(dataset, data_dir, partition_method, partition_alpha, client_number, batch_size): + X_train, y_train, X_test, y_test, net_dataidx_map, traindata_cls_counts = partition_data(dataset, + data_dir, + partition_method, + client_number, + partition_alpha) + class_num = len(np.unique(y_train)) + 
logging.info("traindata_cls_counts = " + str(traindata_cls_counts)) + train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) + + train_data_global, test_data_global = get_dataloader(dataset, data_dir, batch_size, batch_size) + logging.info("train_dl_global number = " + str(len(train_data_global))) + logging.info("test_dl_global number = " + str(len(train_data_global))) + test_data_num = len(test_data_global) + + # get local dataset + data_local_num_dict = dict() + train_data_local_dict = dict() + test_data_local_dict = dict() + + for client_idx in range(client_number): + dataidxs = net_dataidx_map[client_idx] + local_data_num = len(dataidxs) + data_local_num_dict[client_idx] = local_data_num + logging.info("client_idx = %d, local_sample_number = %d" % (client_idx, local_data_num)) + + # training batch size = 64; algorithms batch size = 32 + train_data_local, test_data_local = get_dataloader(dataset, data_dir, batch_size, batch_size, + dataidxs) + logging.info("client_idx = %d, batch_num_train_local = %d, batch_num_test_local = %d" % ( + client_idx, len(train_data_local), len(test_data_local))) + train_data_local_dict[client_idx] = train_data_local + test_data_local_dict[client_idx] = test_data_local + return train_data_num, test_data_num, train_data_global, test_data_global, \ + data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num diff --git a/data_preprocessing/cinic10/datasets.py b/data_preprocessing/cinic10/datasets.py new file mode 100644 index 0000000..454c651 --- /dev/null +++ b/data_preprocessing/cinic10/datasets.py @@ -0,0 +1,105 @@ +import logging + +import numpy as np +from PIL import Image +from torchvision.datasets import DatasetFolder + +logging.basicConfig() +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp') + + +def accimage_loader(path): + import accimage + try: + return accimage.Image(path) + except 
def pil_loader(path):
    """Load an image with PIL and convert it to RGB.

    Opened via a file handle to avoid a ResourceWarning
    (https://github.com/python-pillow/Pillow/issues/835).
    """
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')


def default_loader(path):
    """Dispatch to accimage or PIL depending on the torchvision backend."""
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader(path)
    else:
        return pil_loader(path)


class ImageFolderTruncated(DatasetFolder):
    """A generic data loader where the images are arranged in this way: ::

        root/dog/xxx.png
        root/dog/xxy.png
        root/dog/xxz.png

        root/cat/123.png
        root/cat/nsdf3.png
        root/cat/asd932_.png

    Args:
        root (string): Root directory path.
        dataidxs (sequence of int, optional): If given, the dataset is
            truncated to exactly these sample indices.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        loader (callable, optional): A function to load an image given its path.
        is_valid_file (callable, optional): A function that takes path of an Image file
            and check if the file is a valid_file (used to check of corrupt files)

    Attributes:
        classes (list): List of the class names.
        class_to_idx (dict): Dict with items (class_name, class_index).
        imgs (list): List of (image path, class_index) tuples
    """

    def __init__(self, root, dataidxs=None, transform=None, target_transform=None,
                 loader=default_loader, is_valid_file=None):
        super(ImageFolderTruncated, self).__init__(root, loader, IMG_EXTENSIONS if is_valid_file is None else None,
                                                   transform=transform,
                                                   target_transform=target_transform,
                                                   is_valid_file=is_valid_file)
        self.imgs = self.samples
        self.dataidxs = dataidxs

        # Labels are captured over the FULL directory listing, before
        # truncation, so partitioning code can index them globally.
        self._train_labels = np.array([tup[-1] for tup in self.imgs])

        self.__build_truncated_dataset__()

    def __build_truncated_dataset__(self):
        # Restrict the sample list to the requested indices, if any.
        if self.dataidxs is not None:
            self.imgs = [self.imgs[idx] for idx in self.dataidxs]

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (sample, target) where target is class_index of the target class.
        """
        path, target = self.imgs[index]
        sample = self.loader(path)
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return sample, target

    @property
    def get_train_labels(self):
        return self._train_labels


# --- data_preprocessing/coco/dectection/data_loader.py ---
import logging

import os
import yaml
import math
import torch
import numpy as np
import torch.utils.data as data
import torchvision.transforms as transforms
from .datasets import create_dataloader
from pathlib import Path


def make_divisible(x, divisor):
    # Returns x evenly divisible by divisor (rounded up)
    return math.ceil(x / divisor) * divisor


def check_img_size(img_size, s=32):
    # Verify img_size is a multiple of stride s; round up and warn if not
    new_size = make_divisible(img_size, int(s))  # ceil gs-multiple
    if new_size != img_size:
        print('WARNING: --img-size %g must be multiple of max stride %g, updating to %g' % (img_size, s, new_size))
    return new_size


def partition_data(data_path, partition, n_nets):
    """Split sample indices among ``n_nets`` clients.

    ``data_path`` may be a listing file (one sample per line) or a
    directory; the sample count is derived accordingly.  Only the IID
    ('homo') scheme is implemented.

    Raises:
        NotImplementedError: for any other partition method (fixed:
            previously 'hetero' fell through and the function crashed with
            a NameError on the undefined return value).
    """
    if os.path.isfile(data_path):
        with open(data_path) as f:
            n_data = len(f.readlines())
    else:
        n_data = len(os.listdir(data_path))

    if partition == "homo":
        idxs = np.random.permutation(n_data)
        batch_idxs = np.array_split(idxs, n_nets)
        net_dataidx_map = {i: batch_idxs[i] for i in range(n_nets)}
    else:
        raise NotImplementedError("partition method '%s' is not supported" % partition)

    return net_dataidx_map


def load_partition_data_coco(opt, hyp, model):
    """Build global and per-client COCO detection dataloaders.

    Args:
        opt: parsed command-line options (paths, batch sizes, DDP ranks,
            client_num_in_total, partition_method, ...).
        hyp: augmentation hyperparameter dict.
        model: the detection model; only ``model.stride`` is read here to
            derive the grid size.

    Returns:
        (train_data_num, test_data_num, train_dataloader_global,
         test_dataloader_global, train_data_num_dict,
         train_data_loader_dict, test_data_loader_dict, nc)
    """
    save_dir, epochs, batch_size, total_batch_size, weights, rank = \
        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict

    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names'])  # number classes, names
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size]

    client_number = opt.client_num_in_total
    partition = opt.partition_method

    net_dataidx_map = partition_data(train_path, partition=partition, n_nets=client_number)
    net_dataidx_map_test = partition_data(test_path, partition=partition, n_nets=client_number)

    train_data_loader_dict = dict()
    test_data_loader_dict = dict()
    train_data_num_dict = dict()
    train_dataset_dict = dict()

    train_dataloader_global, train_dataset_global = create_dataloader(
        train_path, imgsz, batch_size, gs, opt,
        hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank,
        world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights)
    train_data_num = len(train_dataset_global)

    test_dataloader_global = create_dataloader(
        test_path, imgsz_test, total_batch_size, gs, opt,
        hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True,
        rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5)[0]
    test_data_num = len(test_dataloader_global.dataset)

    for i in range(client_number):
        print("net_dataidx_map trainer:", net_dataidx_map[i])
        dataloader, dataset = create_dataloader(
            train_path, imgsz, batch_size, gs, opt,
            hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank,
            world_size=opt.world_size, workers=opt.workers,
            image_weights=opt.image_weights, net_dataidx_map=net_dataidx_map[i])
        testloader = create_dataloader(
            test_path, imgsz_test, total_batch_size, gs, opt,
            hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True,
            rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5,
            net_dataidx_map=net_dataidx_map_test[i])[0]

        train_dataset_dict[i] = dataset
        train_data_num_dict[i] = len(dataset)
        train_data_loader_dict[i] = dataloader
        test_data_loader_dict[i] = testloader

    return train_data_num, test_data_num, train_dataloader_global, test_dataloader_global, \
        train_data_num_dict, train_data_loader_dict, test_data_loader_dict, nc
100644 index 0000000..48319b3 --- /dev/null +++ b/data_preprocessing/coco/dectection/dataset.py @@ -0,0 +1,968 @@ +# Dataset utils and dataloaders + +import glob +import logging +import math +import os +import random +import shutil +import time +from itertools import repeat +from multiprocessing.pool import ThreadPool +from pathlib import Path +from threading import Thread + +import cv2 +import numpy as np +import torch +from PIL import Image, ExifTags +from torch.utils.data import Dataset +from tqdm import tqdm +import sys + +from fedml_api.model.object_detection.yolov5.utils.general import xyxy2xywh, xywh2xyxy +from fedml_api.model.object_detection.yolov5.utils.torch_utils import torch_distributed_zero_first + +# Parameters +help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data' +img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng'] # acceptable image suffixes +vid_formats = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes +logger = logging.getLogger(__name__) + +# Get orientation exif tag +for orientation in ExifTags.TAGS.keys(): + if ExifTags.TAGS[orientation] == 'Orientation': + break + + +def get_hash(files): + # Returns a single hash value of a list of files + return sum(os.path.getsize(f) for f in files if os.path.isfile(f)) + + +def exif_size(img): + # Returns exif-corrected PIL size + s = img.size # (width, height) + try: + rotation = dict(img._getexif().items())[orientation] + if rotation == 6: # rotation 270 + s = (s[1], s[0]) + elif rotation == 8: # rotation 90 + s = (s[1], s[0]) + except: + pass + + return s + +def partition_data(data_path, partition, n_nets): + if os.path.isfile(data_path): + with open(data_path) as f: + data = f.readlines() + n_data = len(data) + else: + n_data = len(os.listdir(data_path)) + if partition == "homo": + total_num = n_data + idxs = np.random.permutation(total_num) + batch_idxs = np.array_split(idxs, n_nets) + net_dataidx_map = {i: batch_idxs[i] for i in 
range(n_nets)} + elif partition == 'hetero': + print("not support!") + pass + + return net_dataidx_map + +def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False, + rank=-1, world_size=1, workers=8, image_weights=False, net_dataidx_map=None): + # Make sure only the first process in DDP process the dataset first, and the following others can use the cache + with torch_distributed_zero_first(rank): + dataset = LoadImagesAndLabels(path, imgsz, batch_size, + augment=augment, # augment images + hyp=hyp, # augmentation hyperparameters + rect=rect, # rectangular training + cache_images=cache, + single_cls=opt.single_cls, + stride=int(stride), + pad=pad, + rank=rank, + image_weights=image_weights, + net_dataidx_map=net_dataidx_map) + + batch_size = min(batch_size, len(dataset)) + nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, workers]) # number of workers + sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None + loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader + # Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader() + dataloader = loader(dataset, + batch_size=batch_size, + num_workers=nw, + sampler=sampler, + pin_memory=True, + collate_fn=LoadImagesAndLabels.collate_fn) + return dataloader, dataset + + +class InfiniteDataLoader(torch.utils.data.dataloader.DataLoader): + """ Dataloader that reuses workers + + Uses same syntax as vanilla DataLoader + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler)) + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever + + 
Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) + + +class LoadImages: # for inference + def __init__(self, path, img_size=640): + p = str(Path(path)) # os-agnostic + p = os.path.abspath(p) # absolute path + if '*' in p: + files = sorted(glob.glob(p, recursive=True)) # glob + elif os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '*.*'))) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise Exception('ERROR: %s does not exist' % p) + + images = [x for x in files if x.split('.')[-1].lower() in img_formats] + videos = [x for x in files if x.split('.')[-1].lower() in vid_formats] + ni, nv = len(images), len(videos) + + self.img_size = img_size + self.files = images + videos + self.nf = ni + nv # number of files + self.video_flag = [False] * ni + [True] * nv + self.mode = 'image' + if any(videos): + self.new_video(videos[0]) # new video + else: + self.cap = None + assert self.nf > 0, 'No images or videos found in %s. 
Supported formats are:\nimages: %s\nvideos: %s' % \ + (p, img_formats, vid_formats) + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + + if self.video_flag[self.count]: + # Read video + self.mode = 'video' + ret_val, img0 = self.cap.read() + if not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + else: + path = self.files[self.count] + self.new_video(path) + ret_val, img0 = self.cap.read() + + self.frame += 1 + print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nf, self.frame, self.nframes, path), end='') + + else: + # Read image + self.count += 1 + img0 = cv2.imread(path) # BGR + assert img0 is not None, 'Image Not Found ' + path + print('image %g/%g %s: ' % (self.count, self.nf, path), end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return path, img, img0, self.cap + + def new_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files + + +class LoadWebcam: # for inference + def __init__(self, pipe='0', img_size=640): + self.img_size = img_size + + if pipe.isnumeric(): + pipe = eval(pipe) # local camera + # pipe = 'rtsp://192.168.1.64/1' # IP camera + # pipe = 'rtsp://username:password@192.168.1.64/1' # IP camera with login + # pipe = 'http://wmccpinetop.axiscam.net/mjpg/video.mjpg' # IP golf camera + + self.pipe = pipe + self.cap = cv2.VideoCapture(pipe) # video capture object + self.cap.set(cv2.CAP_PROP_BUFFERSIZE, 3) # set buffer size + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if cv2.waitKey(1) == ord('q'): # q to quit + self.cap.release() + 
cv2.destroyAllWindows() + raise StopIteration + + # Read frame + if self.pipe == 0: # local camera + ret_val, img0 = self.cap.read() + img0 = cv2.flip(img0, 1) # flip left-right + else: # IP camera + n = 0 + while True: + n += 1 + self.cap.grab() + if n % 30 == 0: # skip frames + ret_val, img0 = self.cap.retrieve() + if ret_val: + break + + # Print + assert ret_val, 'Camera Error %s' % self.pipe + img_path = 'webcam.jpg' + print('webcam %g: ' % self.count, end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return img_path, img, img0, None + + def __len__(self): + return 0 + + +class LoadStreams: # multiple IP or RTSP cameras + def __init__(self, sources='streams.txt', img_size=640): + self.mode = 'stream' + self.img_size = img_size + + if os.path.isfile(sources): + with open(sources, 'r') as f: + sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())] + else: + sources = [sources] + + n = len(sources) + self.imgs = [None] * n + self.sources = sources + for i, s in enumerate(sources): + # Start the thread to read frames from the video stream + print('%g/%g: %s... ' % (i + 1, n, s), end='') + cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s) + assert cap.isOpened(), 'Failed to open %s' % s + w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) % 100 + _, self.imgs[i] = cap.read() # guarantee first frame + thread = Thread(target=self.update, args=([i, cap]), daemon=True) + print(' success (%gx%g at %.2f FPS).' 
% (w, h, fps)) + thread.start() + print('') # newline + + # check for common shapes + s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes + self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal + if not self.rect: + print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.') + + def update(self, index, cap): + # Read next stream frame in a daemon thread + n = 0 + while cap.isOpened(): + n += 1 + # _, self.imgs[index] = cap.read() + cap.grab() + if n == 4: # read every 4th frame + _, self.imgs[index] = cap.retrieve() + n = 0 + time.sleep(0.01) # wait time + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + img0 = self.imgs.copy() + if cv2.waitKey(1) == ord('q'): # q to quit + cv2.destroyAllWindows() + raise StopIteration + + # Letterbox + img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0] + + # Stack + img = np.stack(img, 0) + + # Convert + img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416 + img = np.ascontiguousarray(img) + + return self.sources, img, img0, None + + def __len__(self): + return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years + + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep # /images/, /labels/ substrings + return [x.replace(sa, sb, 1).replace('.' 
+ x.split('.')[-1], '.txt') for x in img_paths] + + +class LoadImagesAndLabels(Dataset): # for training/testing + def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False, + cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1, net_dataidx_map=None): + self.img_size = img_size + self.augment = augment + self.hyp = hyp + self.image_weights = image_weights + self.rect = False if image_weights else rect + self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training) + self.mosaic_border = [-img_size // 2, -img_size // 2] + self.stride = stride + + try: + f = [] # image files + for p in path if isinstance(path, list) else [path]: + p = Path(p) # os-agnostic + if p.is_dir(): # dir + f += glob.glob(str(p / '**' / '*.*'), recursive=True) + elif p.is_file(): # file + with open(p, 'r') as t: + t = t.read().strip().splitlines() + parent = str(p.parent) + os.sep + f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + else: + raise Exception('%s does not exist' % p) + self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats]) + assert self.img_files, 'No images found' + except Exception as e: + raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) + + # Check cache + self.label_files = img2label_paths(self.img_files) # labels + cache_path = Path(self.label_files[0]).parent.with_suffix('.cache') # cached labels + if cache_path.is_file(): + cache = torch.load(cache_path) # load + if cache['hash'] != get_hash(self.label_files + self.img_files) or 'results' not in cache: # changed + cache = self.cache_labels(cache_path) # re-cache + else: + cache = self.cache_labels(cache_path) # cache + + # Display cache + [nf, nm, ne, nc, n] = cache.pop('results') # found, missing, empty, corrupted, total + desc = f"Scanning '{cache_path}' for images and labels... 
{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + tqdm(None, desc=desc, total=n, initial=n) + assert nf > 0 or not augment, f'No labels found in {cache_path}. Can not train without labels. See {help_url}' + + # Read cache + cache.pop('hash') # remove hash + labels, shapes = zip(*cache.values()) + self.labels = list(labels) + self.shapes = shapes + # self.shapes = np.array(shapes, dtype=np.float64) + self.img_files = list(cache.keys()) # update + self.label_files = img2label_paths(cache.keys()) # update + + # client + print('net_dataidx_map:', net_dataidx_map) + print("len label:", len(self.labels)) + # print("shapes:", self.shapes) + # 这里根据net_dataidx_map来更新labels和图像等数据 + if net_dataidx_map is not None: + self.labels = [self.labels[i-1] for i in net_dataidx_map] + self.shapes = [self.shapes[i-1] for i in net_dataidx_map] + self.img_files = [self.img_files[i-1] for i in net_dataidx_map] + self.label_files = [self.label_files[i-1] for i in net_dataidx_map] + + self.shapes = np.array(self.shapes, dtype=np.float64) + print("after shapes:", self.shapes, len(self.shapes)) + if single_cls: + for x in self.labels: + x[:, 0] = 0 + + n = len(self.shapes) # number of images + bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index + nb = bi[-1] + 1 # number of batches + self.batch = bi # batch index of image + self.n = n + self.indices = range(n) + print("indices:", n, self.indices) + + # Rectangular Training + if self.rect: + # Sort by aspect ratio + s = self.shapes # wh + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_files = [self.img_files[i] for i in irect] + self.label_files = [self.label_files[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * nb + for i in range(nb): + ari = ar[bi == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [maxi, 1] + elif mini > 1: + shapes[i] = [1, 1 / mini] + + 
self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride + + # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM) + self.imgs = [None] * n + if cache_images: + gb = 0 # Gigabytes of cached images + self.img_hw0, self.img_hw = [None] * n, [None] * n + results = ThreadPool(8).imap(lambda x: load_image(*x), zip(repeat(self), range(n))) # 8 threads + pbar = tqdm(enumerate(results), total=n) + for i, x in pbar: + self.imgs[i], self.img_hw0[i], self.img_hw[i] = x # img, hw_original, hw_resized = load_image(self, i) + gb += self.imgs[i].nbytes + pbar.desc = 'Caching images (%.1fGB)' % (gb / 1E9) + + def cache_labels(self, path=Path('./labels.cache')): + # Cache dataset labels, check images and read shapes + x = {} # dict + nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, duplicate + pbar = tqdm(zip(self.img_files, self.label_files), desc='Scanning images', total=len(self.img_files)) + for i, (im_file, lb_file) in enumerate(pbar): + try: + # verify images + im = Image.open(im_file) + im.verify() # PIL verify + shape = exif_size(im) # image size + assert (shape[0] > 9) & (shape[1] > 9), 'image size <10 pixels' + + # verify labels + if os.path.isfile(lb_file): + nf += 1 # label found + with open(lb_file, 'r') as f: + l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + if len(l): + assert l.shape[1] == 5, 'labels require 5 columns each' + assert (l >= 0).all(), 'negative labels' + assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels' + else: + ne += 1 # label empty + l = np.zeros((0, 5), dtype=np.float32) + else: + nm += 1 # label missing + l = np.zeros((0, 5), dtype=np.float32) + x[im_file] = [l, shape] + except Exception as e: + nc += 1 + print('WARNING: Ignoring corrupted image and/or label %s: %s' % (im_file, e)) + + pbar.desc = f"Scanning 
'{path.parent / path.stem}' for images and labels... " \ + f"{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + + if nf == 0: + print(f'WARNING: No labels found in {path}. See {help_url}') + + x['hash'] = get_hash(self.label_files + self.img_files) + x['results'] = [nf, nm, ne, nc, i + 1] + torch.save(x, path) # save for next time + logging.info(f"New cache created: {path}") + return x + + def __len__(self): + return len(self.img_files) + + # def __iter__(self): + # self.count = -1 + # print('ran dataset iter') + # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) + # return self + + def __getitem__(self, index): + index = self.indices[index] # linear, shuffled, or image_weights + + hyp = self.hyp + mosaic = self.mosaic and random.random() < hyp['mosaic'] + if mosaic: + # Load mosaic + img, labels = load_mosaic(self, index) + shapes = None + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if random.random() < hyp['mixup']: + img2, labels2 = load_mosaic(self, random.randint(0, self.n - 1)) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + + else: + # Load image + img, (h0, w0), (h, w) = load_image(self, index) + + # Letterbox + shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape + img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling + + # Load labels + labels = [] + x = self.labels[index] + if x.size > 0: + # Normalized xywh to pixel xyxy format + labels = x.copy() + labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0] # pad width + labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1] # pad height + labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0] + labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1] + + if self.augment: 
+ # Augment imagespace + if not mosaic: + img, labels = random_perspective(img, labels, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + shear=hyp['shear'], + perspective=hyp['perspective']) + + # Augment colorspace + augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) + + # Apply cutouts + # if random.random() < 0.9: + # labels = cutout(img, labels) + + nL = len(labels) # number of labels + if nL: + labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) # convert xyxy to xywh + labels[:, [2, 4]] /= img.shape[0] # normalized height 0-1 + labels[:, [1, 3]] /= img.shape[1] # normalized width 0-1 + + if self.augment: + # flip up-down + if random.random() < hyp['flipud']: + img = np.flipud(img) + if nL: + labels[:, 2] = 1 - labels[:, 2] + + # flip left-right + if random.random() < hyp['fliplr']: + img = np.fliplr(img) + if nL: + labels[:, 1] = 1 - labels[:, 1] + + labels_out = torch.zeros((nL, 6)) + if nL: + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return torch.from_numpy(img), labels_out, self.img_files[index], shapes + + @staticmethod + def collate_fn(batch): + img, label, path, shapes = zip(*batch) # transposed + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes + + +# Ancillary functions -------------------------------------------------------------------------------------------------- +def load_image(self, index): + # loads 1 image from dataset, returns img, original hw, resized hw + img = self.imgs[index] + if img is None: # not cached + path = self.img_files[index] + img = cv2.imread(path) # BGR + assert img is not None, 'Image Not Found ' + path + h0, w0 = img.shape[:2] # orig hw + r = self.img_size / max(h0, w0) # resize image to img_size + if r != 1: # always resize down, only resize up if 
training with augmentation + interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR + img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp) + return img, (h0, w0), img.shape[:2] # img, hw_original, hw_resized + else: + return self.imgs[index], self.img_hw0[index], self.img_hw[index] # img, hw_original, hw_resized + + +def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5): + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=np.int16) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + + # Histogram equalization + # if random.random() < 0.2: + # for i in range(3): + # img[:, :, i] = cv2.equalizeHist(img[:, :, i]) + + +def load_mosaic(self, index): + # loads images in a mosaic + + labels4 = [] + s = self.img_size + yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border] # mosaic center x, y + indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(3)] # 3 additional image indices + for i, index in enumerate(indices): + # Load image + img, _, (h, w) = load_image(self, index) + + # place img in img4 + if i == 0: # top left + img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + 
elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) + elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + x = self.labels[index] + labels = x.copy() + if x.size > 0: # Normalized xywh to pixel xyxy format + labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw + labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh + labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw + labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh + labels4.append(labels) + + # Concat/clip labels + if len(labels4): + labels4 = np.concatenate(labels4, 0) + np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:]) # use with random_perspective + # img4, labels4 = replicate(img4, labels4) # replicate + + # Augment + img4, labels4 = random_perspective(img4, labels4, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + perspective=self.hyp['perspective'], + border=self.mosaic_border) # border to remove + + return img4, labels4 + + +def replicate(img, labels): + # Replicate labels + h, w = img.shape[:2] + boxes = labels[:, 1:].astype(int) + x1, y1, x2, y2 = boxes.T + s = ((x2 - x1) + (y2 - y1)) / 2 # side length (pixels) + for i in s.argsort()[:round(s.size * 0.5)]: # smallest indices + x1b, y1b, x2b, y2b = boxes[i] + bh, bw = y2b - y1b, x2b - x1b + yc, xc = int(random.uniform(0, h - bh)), int(random.uniform(0, w - bw)) # offset x, y + x1a, y1a, x2a, y2a = [xc, yc, xc + bw, yc + bh] + img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + labels = np.append(labels, [[labels[i, 0], x1a, y1a, x2a, y2a]], axis=0) + + return img, labels + + +def letterbox(img, new_shape=(640, 640), 
color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True): + # Resize image to a 32-pixel-multiple rectangle https://github.com/ultralytics/yolov3/issues/232 + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return img, ratio, (dw, dh) + + +def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # targets = [cls, xyxy] + + height = img.shape[0] + border[0] * 2 # shape(h,w,c) + width = img.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = -img.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3) + P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) + P[2, 1] = 
random.uniform(-perspective, perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3) + a = random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if perspective: + img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + # Visualize + # import matplotlib.pyplot as plt + # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() + # ax[0].imshow(img[:, :, ::-1]) # base + # ax[1].imshow(img2[:, :, ::-1]) # warped + + # Transform label coordinates + n = len(targets) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + if perspective: + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + else: # affine + xy = xy[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # # apply angle-based reduction of bounding boxes + # radians = a * math.pi / 180 + # 
reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + # x = (xy[:, 2] + xy[:, 0]) / 2 + # y = (xy[:, 3] + xy[:, 1]) / 2 + # w = (xy[:, 2] - xy[:, 0]) * reduction + # h = (xy[:, 3] - xy[:, 1]) * reduction + # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + + # filter candidates + i = box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T) + targets = targets[i] + targets[:, 1:5] = xy[i] + + return img, targets + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1): # box1(4,n), box2(4,n) + # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates + + +def cutout(image, labels): + # Applies image cutout augmentation https://arxiv.org/abs/1708.04552 + h, w = image.shape[:2] + + def bbox_ioa(box1, box2): + # Returns the intersection over box2 area given box1, box2. box1 is 4, box2 is nx4. 
boxes are x1y1x2y2 + box2 = box2.transpose() + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + 1e-16 + + # Intersection over box2 area + return inter_area / box2_area + + # create random masks + scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16 # image size fraction + for s in scales: + mask_h = random.randint(1, int(h * s)) + mask_w = random.randint(1, int(w * s)) + + # box + xmin = max(0, random.randint(0, w) - mask_w // 2) + ymin = max(0, random.randint(0, h) - mask_h // 2) + xmax = min(w, xmin + mask_w) + ymax = min(h, ymin + mask_h) + + # apply random color mask + image[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)] + + # return unobscured labels + if len(labels) and s > 0.03: + box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32) + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + labels = labels[ioa < 0.60] # remove >60% obscured labels + + return labels + + +def create_folder(path='./new'): + # Create folder + if os.path.exists(path): + shutil.rmtree(path) # delete output folder + os.makedirs(path) # make new output folder + + +def flatten_recursive(path='../coco128'): + # Flatten a recursive directory by bringing all files to top level + new_path = Path(path + '_flat') + create_folder(new_path) + for file in tqdm(glob.glob(str(Path(path)) + '/**/*.*', recursive=True)): + shutil.copyfile(file, new_path / Path(file).name) + + +def extract_boxes(path='../coco128/'): # from utils.datasets import *; extract_boxes('../coco128') + # Convert detection dataset into classification dataset, with one directory per class + + path = Path(path) # 
images dir + shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing + files = list(path.rglob('*.*')) + n = len(files) # number of files + for im_file in tqdm(files, total=n): + if im_file.suffix[1:] in img_formats: + # image + im = cv2.imread(str(im_file))[..., ::-1] # BGR to RGB + h, w = im.shape[:2] + + # labels + lb_file = Path(img2label_paths([str(im_file)])[0]) + if Path(lb_file).exists(): + with open(lb_file, 'r') as f: + lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + + for j, x in enumerate(lb): + c = int(x[0]) # class + f = (path / 'classifier') / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg' # new filename + if not f.parent.is_dir(): + f.parent.mkdir(parents=True) + + b = x[1:] * [w, h, w, h] # box + # b[2:] = b[2:].max() # rectangle to square + b[2:] = b[2:] * 1.2 + 3 # pad + b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(np.int) + + b[[0, 2]] = np.clip(b[[0, 2]], 0, w) # clip boxes outside of image + b[[1, 3]] = np.clip(b[[1, 3]], 0, h) + assert cv2.imwrite(str(f), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {f}' + + +def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0)): # from utils.datasets import *; autosplit('../coco128') + """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files + # Arguments + path: Path to images directory + weights: Train, val, test weights (list) + """ + path = Path(path) # images dir + files = list(path.rglob('*.*')) + n = len(files) # number of files + indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split + txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt'] # 3 txt files + [(path / x).unlink() for x in txt if (path / x).exists()] # remove existing + for i, img in tqdm(zip(indices, files), total=n): + if img.suffix[1:] in img_formats: + with open(path / txt[i], 'a') as f: + f.write(str(img) + '\n') # add image to txt file diff --git 
a/experiments/centralized/classification/README.md b/experiments/centralized/classification/README.md new file mode 100644 index 0000000..138ca2e --- /dev/null +++ b/experiments/centralized/classification/README.md @@ -0,0 +1,43 @@ + +# PyTorch DDP classification + +## lr_scheduler parameter reference: + + +EfficientNet-B0 with RandAugment - 77.7 top-1, 95.3 top-5 + + + +``` + +sh run_classification.sh 8 1 0 127.0.0.1 11111 "0,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model efficientnet -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.048" + +# crashed need to rerun +sh run_classification.sh 3 1 0 127.0.0.1 11112 "0,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model efficientnet --distributed --if-timm-dataset -b 256 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .048" + +sh run_classification.sh 3 1 0 127.0.0.1 11112 "0,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model efficientnet --distributed --if-timm-dataset -b 256 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .048" + + +sh run_classification.sh 3 1 0 127.0.0.1 11112 "0,1,2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset 
--model efficientnet --distributed --if-timm-dataset -b 256 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .048" + +``` + + +MobileNetV3-Large-100 - 75.766 top-1, 92.542 top-5 + + + +``` +# crashed need to rerun +sh run_classification.sh 4 1 0 127.0.0.1 11113 "0,1,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model mobilenet_v3 --distributed --if-timm-dataset -b 256 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .064 --lr-noise 0.42 0.9" + +# crashed need to rerun +sh run_classification.sh 3 1 0 127.0.0.1 11113 "0,2,3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model mobilenet_v3 --distributed --if-timm-dataset -b 256 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .05 --lr-noise 0.35 0.9" + +sh run_classification.sh 3 1 0 127.0.0.1 11113 "0,1,2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --model mobilenet_v3 --distributed --if-timm-dataset -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr .064 --lr-noise 0.42 0.9" + + +``` +# kill all processes +kill $(ps 
aux | grep "ddp_classification.py" | grep -v grep | awk '{print $2}') +``` \ No newline at end of file diff --git a/experiments/centralized/classification/__init__.py b/experiments/centralized/classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/centralized/classification/configs/example.conf b/experiments/centralized/classification/configs/example.conf new file mode 100644 index 0000000..a1c3d4d --- /dev/null +++ b/experiments/centralized/classification/configs/example.conf @@ -0,0 +1,4 @@ +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 \ No newline at end of file diff --git a/experiments/centralized/classification/ddp_classification.py b/experiments/centralized/classification/ddp_classification.py new file mode 100644 index 0000000..b60fe97 --- /dev/null +++ b/experiments/centralized/classification/ddp_classification.py @@ -0,0 +1,482 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback + + +import numpy as np +import psutil +import setproctitle +import wandb +from mpi4py import MPI +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.nn.parallel import DistributedDataParallel as DDP +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +from utils.tracker import RuntimeTracker +from utils.metrics import Metrics +from utils.wandb_util import wandb_log +from data_preprocessing.ImageNet.data_loader import distributed_centralized_ImageNet_loader +from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks + +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import 
load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + + + + + +from training.centralized_classification_trainer import ClassificationTrainer + + + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + parser.add_argument('--batch_size', '-b', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + # parser.add_argument('--epochs', type=int, default=5, metavar='EP', + # help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + 
parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--local_rank', type=int, default=0, + help='given by torch.distributed.launch') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + # parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + # 
help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.0001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default='step', type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + 
help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + parser.add_argument('--epochs', type=int, default=200, metavar='N', + help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + 
help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + 
parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = distributed_centralized_ImageNet_loader(dataset=dataset_name, data_dir=args.data_dir, + world_size=args.client_num_in_total, + rank=args.rank, batch_size=args.batch_size, + args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'gld23k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'gld23k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=233, batch_size=args.batch_size) + + elif dataset_name == "gld160k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'gld160k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'gld160k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, batch_size=args.batch_size) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + raise Exception("no such dataset") + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = data_loader(args.dataset, args.data_dir, args.partition_method, + args.partition_alpha, args.client_num_in_total, args.batch_size) + + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. 
model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + +def init_ddp(): + # use InfiniBand + os.environ['NCCL_DEBUG'] = 'INFO' + os.environ['NCCL_SOCKET_IFNAME'] = 'lo' + + # This the global rank: 0, 1, 2, ..., 15 + global_rank = int(os.environ['RANK']) + print("int(os.environ['RANK']) = %d" % global_rank) + + # This the globak world_size + world_size = int(os.environ['WORLD_SIZE']) + print("world_size = %d" % world_size) + + # initialize the process group + # dist.init_process_group(backend="nccl", rank=global_rank, world_size=world_size) + dist.init_process_group(backend="nccl", init_method="env://") + + local_rank = args.local_rank + print(f"Running basic DDP example on local rank {local_rank}.") + return local_rank, global_rank + + +def get_ddp_model(model, local_rank): + return DDP(model, device_ids=[local_rank], output_device=local_rank) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="PyTorch DDP Demo") + args = add_args(parser) + # 
parser.add_argument("--local_rank", type=int, default=0) + args = parser.parse_args() + logging.info(args) + # args.weight_decay = args.wd + args.wd = args.weight_decay + + # DDP + local_rank, global_rank = init_ddp() + process_id = global_rank + args.rank = global_rank + + # customize the process name + str_process_name = "ddp_classification:" + str(process_id) + setproctitle.setproctitle(str_process_name) + + # customize the log format + while logging.getLogger().handlers: + logging.getLogger().handlers.clear() + console = logging.StreamHandler() + console.setLevel(logging.INFO) + formatter = logging.Formatter(str(process_id) + + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') + console.setFormatter(formatter) + # Create an instance + logging.getLogger().addHandler(console) + hostname = socket.gethostname() + logging.info("#############process ID = " + str(process_id) + + ", host name = " + hostname + "########" + + ", process ID = " + str(os.getpid()) + + ", process Name = " + str(psutil.Process(os.getpid()))) + + + name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). 
+ if process_id == 0: + wandb.init( + entity="automl", + project="fedcv-classification", + name="FedCV (c new)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # GPU + device = torch.device("cuda:" + str(local_rank)) + + # load data + dataset = load_data(args, args.dataset) + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. + # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + model = model.to(device) + model = get_ddp_model(model, local_rank) + if global_rank == 0: + print(model) + + metrics = Metrics(topks=[1], task="classification") + train_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + test_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + + model_trainer = ClassificationTrainer(model, device, args) + for epoch in range(args.epochs): + model_trainer.train_one_epoch(train_data_global, device, args, epoch) + if global_rank == 0: + model_trainer.test(test_data_global, device, args, metrics, test_tracker) + wandb_log(prefix='Test', sp_values=test_tracker(), com_values={"epoch": epoch}) + # I forget to reset the tracker previously + test_tracker.reset() + dist.destroy_process_group() diff --git a/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 
EfficientNet.md new file mode 100644 index 0000000..62d25c6 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md @@ -0,0 +1,70 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +## Pure + +``` +# running +./single_run_classification.sh "0" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Pure with normal image 
transform + +``` + +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add AutoAugmentation +``` + + +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" +./single_run_classification.sh "2" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Add model EMA +``` +# running + +./single_run_classification.sh "3" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +./single_run_classification.sh "3" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add all +``` +./single_run_classification.sh "3" ~/py36/bin/python " --dataset 
ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "0" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "1" ~/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" + +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md new file mode 100644 index 0000000..6eaeaad --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md @@ -0,0 +1,79 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# 
DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +## Pure + +``` + +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +# running +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +# running +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.06" +``` + +## Pure with normal image transform + +``` +# running +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 
--drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.06" +``` + + +## Add AutoAugmentation +``` +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +# running +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +# running +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset 
ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.06" +``` + +## Add model EMA +``` +# running +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add all +``` +# running +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset ILSVRC2012-100 --data_dir 
/home/datasets/imagenet/ILSVRC2012_dataset --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" + +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/cifar100 EfficientNet.md b/experiments/centralized/classification/experiment_scripts/cifar100 EfficientNet.md new file mode 100644 index 0000000..7c3f319 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/cifar100 EfficientNet.md @@ -0,0 +1,22 @@ +# CIFAR100 MobileNetV3-Large-100 + + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +cifar100_data_dir=/home/datasets/cifar100 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +# Pure +./single_run_classification.sh "0" ~/py36/bin/python " --dataset cifar100 --data_dir /home/datasets/cifar100 --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + + + + + + diff --git a/experiments/centralized/classification/experiment_scripts/cifar100 MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/cifar100 MobileNetV3.md new file mode 100644 index 0000000..5900292 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/cifar100 MobileNetV3.md @@ -0,0 +1,22 @@ +# CIFAR100 MobileNetV3-Large-100 + + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks 
+cifar10_data_dir=/home/datasets/cifar10 +cifar100_data_dir=/home/datasets/cifar100 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + + +# Pure +./single_run_classification.sh "1" ~/py36/bin/python " --dataset cifar100 --data_dir /home/datasets/cifar100 --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + + + + + + diff --git a/experiments/centralized/classification/experiment_scripts/gld160k EfficientNet.md b/experiments/centralized/classification/experiment_scripts/gld160k EfficientNet.md new file mode 100644 index 0000000..b8e91ce --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld160k EfficientNet.md @@ -0,0 +1,62 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 +mnist_data_dir=/nfs_home/datasets/mnist + + +./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 64 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 
0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +## Pure + +``` +cmd=" +cd ~/FedCV_classification/experiments/centralized/classification; +./single_run_classification.sh \"0\" ~/miniconda3/bin/python \" --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03\" " +ssh host5 $cmd +``` + +## Pure with normal image transform + +``` +cmd=" +./single_run_classification.sh \"0\" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +``` + +## Add model EMA +``` +./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add all +``` 
+./single_run_classification.sh "0" ~/miniconda3/bin/python " --dataset gld160k --data_dir /nfs_home/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/gld160k MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/gld160k MobileNetV3.md new file mode 100644 index 0000000..0edbc91 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld160k MobileNetV3.md @@ -0,0 +1,60 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=/home/comp/20481896/datasets/landmarks +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=/home/comp/20481896/datasets/cifar10 +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 +mnist_data_dir=/nfs_home/datasets/mnist + + +## Pure + +``` + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +``` + +## Pure with normal image transform + +``` + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" +``` + +## Add model EMA +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" +``` + + +## Add all +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld160k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode 
pixel --reprob 0.2 --lr 0.03" +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/gld23k EfficientNet.md b/experiments/centralized/classification/experiment_scripts/gld23k EfficientNet.md new file mode 100644 index 0000000..ff2dec6 --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld23k EfficientNet.md @@ -0,0 +1,65 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 +mnist_data_dir=/nfs_home/datasets/mnist + + + +## Pure + +``` + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Pure with normal image transform + +``` + +./single_run_classification.sh "0" 
~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Add model EMA +``` +# running +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.003" + +``` + + +## Add all +``` 
+./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model efficientnet --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + + diff --git a/experiments/centralized/classification/experiment_scripts/gld23k MobileNetV3.md b/experiments/centralized/classification/experiment_scripts/gld23k MobileNetV3.md new file mode 100644 index 0000000..e339a0e --- /dev/null +++ b/experiments/centralized/classification/experiment_scripts/gld23k MobileNetV3.md @@ -0,0 +1,79 @@ +# ILSVRC2012-100 MobileNetV3-Large-100 + +# t716 +PYTHON=~/miniconda3/bin/python +imagenet_data_dir=/nfs_home/datasets/ILSVRC2012 +gld_data_dir=/nfs_home/datasets/landmarks +cifar10_data_dir=/nfs_home/datasets/cifar10 
+mnist_data_dir=/nfs_home/datasets/mnist + +# DAAI +PYTHON=~/py36/bin/python +imagenet_data_dir=/home/datasets/ILSVRC2012_dataset +gld_data_dir=/home/datasets/landmarks +cifar10_data_dir=/home/datasets/cifar10 +GPU_UTIL_FILE=DAAI_gpu_util.yaml +MPI_HOST_FILE=DAAI_mpi_host_file_2 + + +# scigpu +PYTHON=~/anaconda3/envs/py36/bin/python +imagenet_data_dir=/home/datasets/imagenet/ILSVRC2012_dataset +gld_data_dir=~/datasets/landmarks +cifar10_data_dir=~/datasets/cifar10 +GPU_UTIL_FILE=scigpu_gpu_util.yaml +MPI_HOST_FILE=scigpu_mpi_host_file + + +## Pure + +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Pure with normal image transform + +``` + +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks 
--data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "3" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add AutoAugmentation +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel 
--reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" +``` + +## Add model EMA +``` +./single_run_classification.sh "0" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.01" + +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform FLTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --remode pixel --reprob 0.2 --lr 0.003" +``` + + +## Add all +``` +./single_run_classification.sh "2" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir 
/home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.03" + +./single_run_classification.sh "1" ~/anaconda3/envs/py36/bin/python " --dataset gld23k --data_dir /home/comp/20481896/datasets/landmarks --data_transform NormalTransform --model mobilenet_v3 --if-timm-dataset -b 256 --sched step --epochs 100 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --lr 0.003" + + + +``` + + diff --git a/experiments/centralized/classification/remote_run.sh b/experiments/centralized/classification/remote_run.sh new file mode 100644 index 0000000..931f374 --- /dev/null +++ b/experiments/centralized/classification/remote_run.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +CD_PATH=$1 +HOST=$2 +EXECUTE_CMD=$3 + +echo $CD_PATH +echo $HOST +echo $EXECUTE_CMD + +cmd="cd $CD_PATH ; $EXECUTE_CMD" +echo $cmd +ssh $HOST $cmd diff --git a/experiments/centralized/classification/run_classification.sh b/experiments/centralized/classification/run_classification.sh new file mode 100644 index 0000000..a426e86 --- /dev/null +++ b/experiments/centralized/classification/run_classification.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +NPROC_PER_NODE=$1 +NNODE=$2 +NODE_RANK=$3 +MASTER_ADDR=$4 +MASTER_PORT=$5 +GPU_UTIL=$6 +PYTHON=$7 +ARGS=$8 + + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +--nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +./ddp_classification.py --client_num_in_total $NPROC_PER_NODE $ARGS + +# 
CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/centralized/classification/run_classification_with_conf.sh b/experiments/centralized/classification/run_classification_with_conf.sh new file mode 100644 index 0000000..a426e86 --- /dev/null +++ b/experiments/centralized/classification/run_classification_with_conf.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +NPROC_PER_NODE=$1 +NNODE=$2 +NODE_RANK=$3 +MASTER_ADDR=$4 +MASTER_PORT=$5 +GPU_UTIL=$6 +PYTHON=$7 +ARGS=$8 + + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +--nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +./ddp_classification.py --client_num_in_total $NPROC_PER_NODE $ARGS + +# CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/centralized/classification/single_classification.py b/experiments/centralized/classification/single_classification.py new file mode 100755 index 0000000..1e8b72f --- /dev/null +++ b/experiments/centralized/classification/single_classification.py @@ -0,0 +1,437 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback + + +import numpy as np +import psutil +import setproctitle +import wandb +import torch +import torch.nn as nn +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +from utils.tracker import RuntimeTracker +from utils.metrics import Metrics 
+from utils.wandb_util import wandb_log +from data_preprocessing.ImageNet.data_loader import distributed_centralized_ImageNet_loader +from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + + +from training.centralized_classification_trainer import ClassificationTrainer + +from utils.logger import ( + logging_config +) + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + parser.add_argument('--batch_size', '-b', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # 
help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + # parser.add_argument('--epochs', type=int, default=5, metavar='EP', + # help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--local_rank', type=int, default=0, + help='given by torch.distributed.launch') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # logging settings + parser.add_argument('--level', type=str, default='INFO', + help='level of logging') + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + 
parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + # parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + # help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.0001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default=None, type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + 
parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + parser.add_argument('--epochs', type=int, default=200, metavar='N', + help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training 
augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. 
(default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + 
parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = distributed_centralized_ImageNet_loader(dataset=dataset_name, data_dir=args.data_dir, + world_size=args.client_num_in_total, + rank=args.rank, batch_size=args.batch_size, + args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'mini_gld_train_split.csv') + fed_test_map_file = os.path.join(args.data_dir, 'mini_gld_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=233, batch_size=args.batch_size, args=args) + + elif dataset_name == "gld160k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'federated_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=1262, batch_size=args.batch_size, args=args) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = 
load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + raise Exception("no such dataset") + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = data_loader(args.dataset, args.data_dir, args.partition_method, + args.partition_alpha, args.client_num_in_total, args.batch_size) + + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="PyTorch DDP Demo") + args = add_args(parser) + # parser.add_argument("--local_rank", type=int, default=0) + args = parser.parse_args() + logging.info(args) + 
# args.weight_decay = args.wd + args.wd = args.weight_decay + + process_id = 0 + args.rank = 0 + + # customize the process name + str_process_name = "single_classification:" + str(process_id) + setproctitle.setproctitle(str_process_name) + + logging_config(args, process_id) + + + name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). + if process_id == 0: + wandb.init( + entity="automl", + project="fedcv-classification", + name="FedCV (c new)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # GPU + device = torch.device("cuda:" + str(process_id)) + + # load data + dataset = load_data(args, args.dataset) + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. 
+ # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + model = model.to(device) + print(model) + + metrics = Metrics(topks=[1], task="classification") + train_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + test_tracker = RuntimeTracker(things_to_track=metrics.metric_names) + + model_trainer = ClassificationTrainer(model, device, args) + for epoch in range(args.epochs): + model_trainer.train_one_epoch(train_data_global, device, args, epoch, train_tracker, metrics) + model_trainer.test(test_data_global, device, args, test_tracker, metrics) + wandb_log(prefix='Test', sp_values=test_tracker(), com_values={"epoch": epoch}) + wandb_log(prefix='Train', sp_values=train_tracker(), com_values={"epoch": epoch}) + train_tracker.reset() + test_tracker.reset() diff --git a/experiments/centralized/classification/single_run_classification.sh b/experiments/centralized/classification/single_run_classification.sh new file mode 100755 index 0000000..eae0f4d --- /dev/null +++ b/experiments/centralized/classification/single_run_classification.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +GPU_UTIL=$1 +PYTHON=$2 +ARGS=$3 + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON ./single_classification.py --client_num_in_total 1 $ARGS + +# CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/centralized/classification/single_run_classification_with_conf.sh b/experiments/centralized/classification/single_run_classification_with_conf.sh new file mode 100644 index 0000000..fbaf85b --- /dev/null +++ b/experiments/centralized/classification/single_run_classification_with_conf.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +GPU_UTIL=$1 +DATASET=$2 +DATA_DIR=$3 +ARGS=$4 + 
+source configs/cluster.conf +PYTHON=`cat configs/cluster.conf | grep PYTHON | awk -F= "{print $2}"` +data_dir=`cat configs/cluster.conf | grep $DATA_DIR | awk -F= "{print $2}"` + + +CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON ./single_classification.py \ + --client_num_in_total 1 \ + --data_dir $data_dir --dataset $DATASET \ + $ARGS + +# CUDA_VISIBLE_DEVICES=$GPU_UTIL $PYTHON -m torch.distributed.launch \ +# --nproc_per_node=$NPROC_PER_NODE --nnodes=$NNODE --node_rank=$NODE_RANK \ +# --master_addr $MASTER_ADDR \ +# --master_port $MASTER_PORT \ +# ./ddp_classification.py diff --git a/experiments/distributed/Detection/data/coco.yaml b/experiments/distributed/Detection/data/coco.yaml new file mode 100644 index 0000000..09f3a78 --- /dev/null +++ b/experiments/distributed/Detection/data/coco.yaml @@ -0,0 +1,35 @@ +# COCO 2017 dataset http://cocodataset.org +# Train command: python train.py --data coco.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /coco +# /yolov5 + + +# download command/URL (optional) +download: bash data/scripts/get_coco.sh + +# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/] +train: ../coco/train2017.txt # 118287 images +val: ../coco/val2017.txt # 5000 images +test: ../coco/test-dev2017.txt # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794 + +# number of classes +nc: 80 + +# class names +names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', + 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush'] + +# Print classes +# with open('data/coco.yaml') as f: +# d = yaml.load(f, Loader=yaml.FullLoader) # dict +# for i, x in enumerate(d['names']): +# print(i, x) diff --git a/experiments/distributed/Detection/data/coco128.yaml b/experiments/distributed/Detection/data/coco128.yaml new file mode 100644 index 0000000..12e1d79 --- /dev/null +++ b/experiments/distributed/Detection/data/coco128.yaml @@ -0,0 +1,28 @@ +# COCO 2017 dataset http://cocodataset.org - first 128 training images +# Train command: python train.py --data coco128.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /coco128 +# /yolov5 + + +# download command/URL (optional) +download: https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip + +# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/] +train: ../coco128/images/train2017/ # 128 images +val: ../coco128/images/train2017/ # 128 images + +# number of classes +nc: 80 + +# class names +names: ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', + 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', + 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 
'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', + 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush'] diff --git a/experiments/distributed/Detection/data/hyp.scratch.yaml b/experiments/distributed/Detection/data/hyp.scratch.yaml new file mode 100644 index 0000000..44f26b6 --- /dev/null +++ b/experiments/distributed/Detection/data/hyp.scratch.yaml @@ -0,0 +1,33 @@ +# Hyperparameters for COCO training from scratch +# python train.py --batch 40 --cfg yolov5m.yaml --weights '' --data coco.yaml --img 640 --epochs 300 +# See tutorials for hyperparameter evolution https://github.com/ultralytics/yolov5#tutorials + + +lr0: 0.01 # initial learning rate (SGD=1E-2, Adam=1E-3) +lrf: 0.2 # final OneCycleLR learning rate (lr0 * lrf) +momentum: 0.937 # SGD momentum/Adam beta1 +weight_decay: 0.0005 # optimizer weight decay 5e-4 +warmup_epochs: 3.0 # warmup epochs (fractions ok) +warmup_momentum: 0.8 # warmup initial momentum +warmup_bias_lr: 0.1 # warmup initial bias lr +box: 0.05 # box loss gain +cls: 0.5 # cls loss gain +cls_pw: 1.0 # cls BCELoss positive_weight +obj: 1.0 # obj loss gain (scale with pixels) +obj_pw: 1.0 # obj BCELoss positive_weight +iou_t: 0.20 # IoU training threshold +anchor_t: 4.0 # anchor-multiple threshold +# anchors: 3 # anchors per output layer (0 to ignore) +fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5) +hsv_h: 0.015 # image HSV-Hue augmentation (fraction) +hsv_s: 0.7 # image HSV-Saturation augmentation (fraction) +hsv_v: 0.4 # image HSV-Value augmentation (fraction) +degrees: 0.0 # image rotation (+/- deg) +translate: 0.1 # image translation (+/- fraction) +scale: 0.5 # image scale (+/- gain) +shear: 0.0 # image shear (+/- deg) +perspective: 0.0 # image perspective (+/- fraction), range 0-0.001 +flipud: 0.0 # image flip up-down (probability) +fliplr: 0.5 # image flip left-right 
(probability) +mosaic: 1.0 # image mosaic (probability) +mixup: 0.0 # image mixup (probability) diff --git a/experiments/distributed/Detection/data/scripts/get_coco.sh b/experiments/distributed/Detection/data/scripts/get_coco.sh new file mode 100644 index 0000000..157a0b0 --- /dev/null +++ b/experiments/distributed/Detection/data/scripts/get_coco.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# COCO 2017 dataset http://cocodataset.org +# Download command: bash data/scripts/get_coco.sh +# Train command: python train.py --data coco.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /coco +# /yolov5 + +# Download/unzip labels +d='../' # unzip directory +url=https://github.com/ultralytics/yolov5/releases/download/v1.0/ +f='coco2017labels.zip' # 68 MB +echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove + +# Download/unzip images +d='../coco/images' # unzip directory +url=http://images.cocodataset.org/zips/ +f1='train2017.zip' # 19G, 118k images +f2='val2017.zip' # 1G, 5k images +f3='test2017.zip' # 7G, 41k images (optional) +for f in $f1 $f2; do + echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove +done diff --git a/experiments/distributed/Detection/data/scripts/get_voc.sh b/experiments/distributed/Detection/data/scripts/get_voc.sh new file mode 100644 index 0000000..6bdaa9b --- /dev/null +++ b/experiments/distributed/Detection/data/scripts/get_voc.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC/ +# Download command: bash data/scripts/get_voc.sh +# Train command: python train.py --data voc.yaml +# Default dataset location is next to /yolov5: +# /parent_folder +# /VOC +# /yolov5 + +start=$(date +%s) +mkdir -p ../tmp +cd ../tmp/ + +# Download/unzip images and labels +d='.' 
# unzip directory +url=https://github.com/ultralytics/yolov5/releases/download/v1.0/ +f1=VOCtrainval_06-Nov-2007.zip # 446MB, 5012 images +f2=VOCtest_06-Nov-2007.zip # 438MB, 4953 images +f3=VOCtrainval_11-May-2012.zip # 1.95GB, 17126 images +for f in $f1 $f2 $f3; do + echo 'Downloading' $url$f ' ...' && curl -L $url$f -o $f && unzip -q $f -d $d && rm $f # download, unzip, remove +done + +end=$(date +%s) +runtime=$((end - start)) +echo "Completed in" $runtime "seconds" + +echo "Splitting dataset..." +python3 - "$@" <train.txt +cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt >train.all.txt + +python3 - "$@" < + if process_ID == 0: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + return device + process_gpu_dict = dict() + for client_index in range(fl_worker_num): + gpu_index = client_index % gpu_num_per_machine + process_gpu_dict[client_index] = gpu_index + + logging.info(process_gpu_dict) + device = torch.device("cuda:" + str(process_gpu_dict[process_ID - 1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + return device + +if __name__ == '__main__': + import tracemalloc + tracemalloc.start() + parser = argparse.ArgumentParser() + parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path') + parser.add_argument('--cfg', type=str, default='', help='model.yaml path') + parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path') + parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') + # parser.add_argument('--epochs', type=int, default=300) + # parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs') + parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') + parser.add_argument('--rect', action='store_true', help='rectangular training') + parser.add_argument('--resume', nargs='?', const=True, 
default=False, help='resume most recent training') + parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') + parser.add_argument('--notest', action='store_true', help='only test final epoch') + parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') + parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') + parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') + parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') + parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') + parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') + parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') + parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') + + parser.add_argument('--optimizer', default=None, help='optimizer') + parser.add_argument('--scheduler', default=None , help='optimizer scheduler') + parser.add_argument('--wandb', default=None, help='wandb init') + parser.add_argument('--ema', default=None, help='ema init') + + parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') + parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') + parser.add_argument('--log-imgs', type=int, default=16, help='number of images for W&B logging, max 100') + parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers') + parser.add_argument('--project', default='runs/train', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', 
help='existing project/name ok, do not increment') + parser.add_argument('--linear-lr', action='store_true', help='linear LR') + parser.add_argument('--model_stride', default=0, type=int) + + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + parser.add_argument('--batch_size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + parser.add_argument('--epochs', type=int, default=5, metavar='EP', + help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the 
algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--round_idx', type=int, default=0, + help='round_idx') + opt = parser.parse_args() + opt.total_batch_size = opt.batch_size + opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 + opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 + set_logging(opt.global_rank) + if opt.global_rank in [-1, 0]: + check_git_status() + + # fedml + comm, process_id, worker_number = FedML_init() + logging.info(opt) + print("process_id:", process_id) + #if process_id == 0: + wandb.init( + # project="federated_nas", + project="fedml distributed", + name="FedAVG(d)" + str(opt.partition_method) + "-c" + str(opt.comm_round) + "-e" + str( + opt.epochs), + config=opt + ) + device = init_training_device(process_id, worker_number - 1, opt.gpu_num_per_server) + str_process_name = "FedAvg (distributed):" + str(process_id) + setproctitle.setproctitle(str_process_name) + + # customize the log format + # logging.basicConfig(level=logging.INFO, + logging.basicConfig(level=logging.DEBUG, + format=str( + process_id) + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', + datefmt='%a, %d %b %Y %H:%M:%S') + hostname = socket.gethostname() + logging.info("#############process ID = " + str(process_id) + + ", host name = " + hostname + "########" + + ", process ID = " + str(os.getpid()) + + ", process Name = " + str(psutil.Process(os.getpid()))) + + if opt.resume: # resume an interrupted run + ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path + assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' + with open(Path(ckpt).parent.parent / 'opt.yaml') as f: + opt = 
argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace + opt.cfg, opt.weights, opt.resume = '', ckpt, True + logger.info('Resuming training from %s' % ckpt) + else: + # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') + opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files + assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' + opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) + opt.name = 'evolve' if opt.evolve else opt.name + opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) # increment run + + # DDP mode + #device = select_device(opt.device, batch_size=opt.batch_size) + if opt.local_rank != -1: + assert torch.cuda.device_count() > opt.local_rank + torch.cuda.set_device(opt.local_rank) + device = torch.device('cuda', opt.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') # distributed backend + assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' + opt.batch_size = opt.total_batch_size // opt.world_size + + # Hyperparameters + with open(opt.hyp) as f: + hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps + if 'box' not in hyp: + warn('Compatibility: %s missing "box" which was renamed from "giou" in %s' % + (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120')) + hyp['box'] = hyp.pop('giou') + + + logger.info(f'Hyperparameters {hyp}') + save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank + + # Directories + wdir = save_dir / 'weights' + wdir.mkdir(parents=True, exist_ok=True) # make dir + last = wdir / 'last.pt' + best = wdir / 'best.pt' + results_file = save_dir / 'results.txt' + + # Save run settings + with open(save_dir / 'hyp.yaml', 'w') as f: 
+ yaml.dump(hyp, f, sort_keys=False) + with open(save_dir / 'opt.yaml', 'w') as f: + yaml.dump(vars(opt), f, sort_keys=False) + + # Configure + plots = not opt.evolve # create plots + cuda = device.type != 'cpu' + init_seeds(2 + rank) + with open(opt.data) as f: + data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict + with torch_distributed_zero_first(rank): + check_dataset(data_dict) # check + train_path = data_dict['train'] + test_path = data_dict['val'] + nc, names = (1, ['item']) if opt.single_cls else ( + int(data_dict['nc']), data_dict['names']) # number classes, names + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check + + # Model + print("weights:", weights) + pretrained = weights.endswith('.pt') + if pretrained: + with torch_distributed_zero_first(rank): + attempt_download(weights) # download if not found locally + ckpt = torch.load(weights, map_location=device) # load checkpoint + if hyp.get('anchors'): + ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor + model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create + exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys + state_dict = ckpt['model'].float().state_dict() # to FP32 + state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect + model.load_state_dict(state_dict, strict=False) # load + logger.info( + 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + else: + model = Model(opt.cfg, ch=3, nc=nc).to(device) # create + + # Freeze + freeze = [] # parameter names to freeze (full or partial) + for k, v in model.named_parameters(): + v.requires_grad = True # train all layers + if any(x in k for x in freeze): + print('freezing %s' % k) + v.requires_grad = False + + # fedml + + dataset = load_partition_data_coco(opt, hyp, model) + [train_data_num, test_data_num, train_data_global, test_data_global, + 
train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + opt.model_stride = model.stride + gs = int(max(model.stride)) # grid size (max stride) + imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples + + + # SyncBatchNorm + if opt.sync_bn and cuda and rank != -1: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) + logger.info('Using SyncBatchNorm()') + + # EMA + ema = ModelEMA(model) if rank in [-1, 0] else None + + # DDP mode + if cuda and rank != -1: + model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) + + hyp['cls'] *= nc / 80. + model.nc = nc # attach number of classes to model + model.hyp = hyp # attach hyperparameters to model + model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) + model.class_weights = labels_to_class_weights(train_data_global.dataset.labels, nc).to(device) # attach class weights + model.names = names + args = (opt, hyp) + # Optimizer + nbs = 64 # nominal batch size + accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing + hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + # logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + for k, v in model.named_modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d): + pg0.append(v.weight) # no decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + if opt.adam: + optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum + else: + optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) + + optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay + 
optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) + del pg0, pg1, pg2 + + total_epochs = epochs * opt.comm_round + + lf = lambda x: ((1 + math.cos(x * math.pi / total_epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf'] # cosine + scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) + + + + opt.scheduler = scheduler + opt.optimizer = optimizer + opt.ema = ema + + opt.hyp = hyp # add hyperparameters + + opt.wandb = wandb + device = init_training_device(process_id, worker_number - 1, opt.gpu_num_per_server) + # start "federated averaging (FedAvg)" + print("start distributed") + + try: + # start "federated averaging (FedAvg)" + print("start distributed") + FedML_FedAvg_distributed(process_id, worker_number, device, comm, + model, train_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, opt, None, True, hyp) + except Exception as e: + print(e) + logging.info('traceback.format_exc():\n%s' % traceback.format_exc()) + MPI.COMM_WORLD.Abort() + + diff --git a/experiments/distributed/Detection/models/experimental.py b/experiments/distributed/Detection/models/experimental.py new file mode 100644 index 0000000..0835ba9 --- /dev/null +++ b/experiments/distributed/Detection/models/experimental.py @@ -0,0 +1,152 @@ +# This file contains experimental modules + +import numpy as np +import torch +import torch.nn as nn + +from .common import Conv, DWConv +from utils.google_utils import attempt_download + + +class CrossConv(nn.Module): + # Cross Convolution Downsample + def __init__(self, c1, c2, k=3, s=1, g=1, e=1.0, shortcut=False): + # ch_in, ch_out, kernel, stride, groups, expansion, shortcut + super(CrossConv, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, (1, k), (1, s)) + self.cv2 = Conv(c_, c2, (k, 1), (s, 1), g=g) + self.add = shortcut and c1 == c2 + + def 
forward(self, x): + return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class C3(nn.Module): + # Cross Convolution CSP + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion + super(C3, self).__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) + self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) + self.cv4 = Conv(2 * c_, c2, 1, 1) + self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) + self.act = nn.LeakyReLU(0.1, inplace=True) + self.m = nn.Sequential(*[CrossConv(c_, c_, 3, 1, g, 1.0, shortcut) for _ in range(n)]) + + def forward(self, x): + y1 = self.cv3(self.m(self.cv1(x))) + y2 = self.cv2(x) + return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1)))) + + +class Sum(nn.Module): + # Weighted sum of 2 or more layers https://arxiv.org/abs/1911.09070 + def __init__(self, n, weight=False): # n: number of inputs + super(Sum, self).__init__() + self.weight = weight # apply weights boolean + self.iter = range(n - 1) # iter object + if weight: + self.w = nn.Parameter(-torch.arange(1., n) / 2, requires_grad=True) # layer weights + + def forward(self, x): + y = x[0] # no weight + if self.weight: + w = torch.sigmoid(self.w) * 2 + for i in self.iter: + y = y + x[i + 1] * w[i] + else: + for i in self.iter: + y = y + x[i + 1] + return y + + +class GhostConv(nn.Module): + # Ghost Convolution https://github.com/huawei-noah/ghostnet + def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups + super(GhostConv, self).__init__() + c_ = c2 // 2 # hidden channels + self.cv1 = Conv(c1, c_, k, s, None, g, act) + self.cv2 = Conv(c_, c_, 5, 1, None, c_, act) + + def forward(self, x): + y = self.cv1(x) + return torch.cat([y, self.cv2(y)], 1) + + +class GhostBottleneck(nn.Module): + # Ghost Bottleneck https://github.com/huawei-noah/ghostnet + def __init__(self, c1, c2, k, s): + 
super(GhostBottleneck, self).__init__() + c_ = c2 // 2 + self.conv = nn.Sequential(GhostConv(c1, c_, 1, 1), # pw + DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw + GhostConv(c_, c2, 1, 1, act=False)) # pw-linear + self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), + Conv(c1, c2, 1, 1, act=False)) if s == 2 else nn.Identity() + + def forward(self, x): + return self.conv(x) + self.shortcut(x) + + +class MixConv2d(nn.Module): + # Mixed Depthwise Conv https://arxiv.org/abs/1907.09595 + def __init__(self, c1, c2, k=(1, 3), s=1, equal_ch=True): + super(MixConv2d, self).__init__() + groups = len(k) + if equal_ch: # equal c_ per group + i = torch.linspace(0, groups - 1E-6, c2).floor() # c2 indices + c_ = [(i == g).sum() for g in range(groups)] # intermediate channels + else: # equal weight.numel() per group + b = [c2] + [0] * groups + a = np.eye(groups + 1, groups, k=-1) + a -= np.roll(a, 1, axis=1) + a *= np.array(k) ** 2 + a[0] = 1 + c_ = np.linalg.lstsq(a, b, rcond=None)[0].round() # solve for equal weight indices, ax = b + + self.m = nn.ModuleList([nn.Conv2d(c1, int(c_[g]), k[g], s, k[g] // 2, bias=False) for g in range(groups)]) + self.bn = nn.BatchNorm2d(c2) + self.act = nn.LeakyReLU(0.1, inplace=True) + + def forward(self, x): + return x + self.act(self.bn(torch.cat([m(x) for m in self.m], 1))) + + +class Ensemble(nn.ModuleList): + # Ensemble of models + def __init__(self): + super(Ensemble, self).__init__() + + def forward(self, x, augment=False): + y = [] + for module in self: + y.append(module(x, augment)[0]) + # y = torch.stack(y).max(0)[0] # max ensemble + # y = torch.cat(y, 1) # nms ensemble + y = torch.stack(y).mean(0) # mean ensemble + return y, None # inference, train output + + +def attempt_load(weights, map_location=None): + # Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a + model = Ensemble() + for w in weights if isinstance(weights, list) else [weights]: + attempt_download(w) + 
model.append(torch.load(w, map_location=map_location)['model'].float().fuse().eval()) # load FP32 model + + # Compatibility updates + for m in model.modules(): + if type(m) in [nn.Hardswish, nn.LeakyReLU, nn.ReLU, nn.ReLU6]: + m.inplace = True # pytorch 1.7.0 compatibility + elif type(m) is Conv: + m._non_persistent_buffers_set = set() # pytorch 1.6.0 compatibility + + if len(model) == 1: + return model[-1] # return model + else: + print('Ensemble created with %s\n' % weights) + for k in ['names', 'stride']: + setattr(model, k, getattr(model[-1], k)) + return model # return ensemble diff --git a/experiments/distributed/Detection/models/yolov5l.yaml b/experiments/distributed/Detection/models/yolov5l.yaml new file mode 100644 index 0000000..1309554 --- /dev/null +++ b/experiments/distributed/Detection/models/yolov5l.yaml @@ -0,0 +1,48 @@ +# parameters +nc: 80 # number of classes +depth_multiple: 1.0 # model depth multiple +width_multiple: 1.0 # layer channel multiple + +# anchors +anchors: + - [10,13, 16,30, 33,23] # P3/8 + - [30,61, 62,45, 59,119] # P4/16 + - [116,90, 156,198, 373,326] # P5/32 + +# YOLOv5 backbone +backbone: + # [from, number, module, args] + [[-1, 1, Focus, [64, 3]], # 0-P1/2 + [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 + [-1, 3, BottleneckCSP, [128]], + [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 + [-1, 9, BottleneckCSP, [256]], + [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 + [-1, 9, BottleneckCSP, [512]], + [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 + [-1, 1, SPP, [1024, [5, 9, 13]]], + [-1, 3, BottleneckCSP, [1024, False]], # 9 + ] + +# YOLOv5 head +head: + [[-1, 1, Conv, [512, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 6], 1, Concat, [1]], # cat backbone P4 + [-1, 3, BottleneckCSP, [512, False]], # 13 + + [-1, 1, Conv, [256, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 4], 1, Concat, [1]], # cat backbone P3 + [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) + + [-1, 1, Conv, [256, 3, 2]], + [[-1, 14], 1, Concat, [1]], # cat 
head P4 + [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) + + [-1, 1, Conv, [512, 3, 2]], + [[-1, 10], 1, Concat, [1]], # cat head P5 + [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) + + [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) + ] diff --git a/experiments/distributed/Detection/models/yolov5m.yaml b/experiments/distributed/Detection/models/yolov5m.yaml new file mode 100644 index 0000000..eb50a71 --- /dev/null +++ b/experiments/distributed/Detection/models/yolov5m.yaml @@ -0,0 +1,48 @@ +# parameters +nc: 80 # number of classes +depth_multiple: 0.67 # model depth multiple +width_multiple: 0.75 # layer channel multiple + +# anchors +anchors: + - [10,13, 16,30, 33,23] # P3/8 + - [30,61, 62,45, 59,119] # P4/16 + - [116,90, 156,198, 373,326] # P5/32 + +# YOLOv5 backbone +backbone: + # [from, number, module, args] + [[-1, 1, Focus, [64, 3]], # 0-P1/2 + [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 + [-1, 3, BottleneckCSP, [128]], + [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 + [-1, 9, BottleneckCSP, [256]], + [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 + [-1, 9, BottleneckCSP, [512]], + [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 + [-1, 1, SPP, [1024, [5, 9, 13]]], + [-1, 3, BottleneckCSP, [1024, False]], # 9 + ] + +# YOLOv5 head +head: + [[-1, 1, Conv, [512, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 6], 1, Concat, [1]], # cat backbone P4 + [-1, 3, BottleneckCSP, [512, False]], # 13 + + [-1, 1, Conv, [256, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 4], 1, Concat, [1]], # cat backbone P3 + [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) + + [-1, 1, Conv, [256, 3, 2]], + [[-1, 14], 1, Concat, [1]], # cat head P4 + [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) + + [-1, 1, Conv, [512, 3, 2]], + [[-1, 10], 1, Concat, [1]], # cat head P5 + [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) + + [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) + ] diff --git 
a/experiments/distributed/Detection/models/yolov5s.yaml b/experiments/distributed/Detection/models/yolov5s.yaml new file mode 100644 index 0000000..2bec452 --- /dev/null +++ b/experiments/distributed/Detection/models/yolov5s.yaml @@ -0,0 +1,48 @@ +# parameters +nc: 80 # number of classes +depth_multiple: 0.33 # model depth multiple +width_multiple: 0.50 # layer channel multiple + +# anchors +anchors: + - [10,13, 16,30, 33,23] # P3/8 + - [30,61, 62,45, 59,119] # P4/16 + - [116,90, 156,198, 373,326] # P5/32 + +# YOLOv5 backbone +backbone: + # [from, number, module, args] + [[-1, 1, Focus, [64, 3]], # 0-P1/2 + [-1, 1, Conv, [128, 3, 2]], # 1-P2/4 + [-1, 3, BottleneckCSP, [128]], + [-1, 1, Conv, [256, 3, 2]], # 3-P3/8 + [-1, 9, BottleneckCSP, [256]], + [-1, 1, Conv, [512, 3, 2]], # 5-P4/16 + [-1, 9, BottleneckCSP, [512]], + [-1, 1, Conv, [1024, 3, 2]], # 7-P5/32 + [-1, 1, SPP, [1024, [5, 9, 13]]], + [-1, 3, BottleneckCSP, [1024, False]], # 9 + ] + +# YOLOv5 head +head: + [[-1, 1, Conv, [512, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 6], 1, Concat, [1]], # cat backbone P4 + [-1, 3, BottleneckCSP, [512, False]], # 13 + + [-1, 1, Conv, [256, 1, 1]], + [-1, 1, nn.Upsample, [None, 2, 'nearest']], + [[-1, 4], 1, Concat, [1]], # cat backbone P3 + [-1, 3, BottleneckCSP, [256, False]], # 17 (P3/8-small) + + [-1, 1, Conv, [256, 3, 2]], + [[-1, 14], 1, Concat, [1]], # cat head P4 + [-1, 3, BottleneckCSP, [512, False]], # 20 (P4/16-medium) + + [-1, 1, Conv, [512, 3, 2]], + [[-1, 10], 1, Concat, [1]], # cat head P5 + [-1, 3, BottleneckCSP, [1024, False]], # 23 (P5/32-large) + + [[17, 20, 23], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5) + ] diff --git a/experiments/distributed/Detection/run_fedavg_distributed_pytorch.sh b/experiments/distributed/Detection/run_fedavg_distributed_pytorch.sh new file mode 100644 index 0000000..6a1b944 --- /dev/null +++ b/experiments/distributed/Detection/run_fedavg_distributed_pytorch.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + 
+CLIENT_NUM=$1 +WORKER_NUM=$2 +SERVER_NUM=$3 +GPU_NUM_PER_SERVER=$4 +DATA=$5 +DISTRIBUTION=$6 +ROUND=$7 +EPOCH=$8 +BATCH_SIZE=$9 +LR=${10} +DATASET=${11} +DATA_DIR=${12} +WEIGHTS=${13} +CI=${14} +DEVICE=${15} + +PROCESS_NUM=`expr $WORKER_NUM + 1` +#echo $PROCESS_NUM +echo $DATA +echo $DATASET +echo $DATA_DIR +export PYTHONWARNINGS='ignore:semaphore_tracker:UserWarning' + +hostname > mpi_host_file + +mpirun -np $PROCESS_NUM -hostfile ./mpi_host_file python ./main_fedavg_yolo.py \ + --gpu_server_num $SERVER_NUM \ + --gpu_num_per_server $GPU_NUM_PER_SERVER \ + --data $DATA \ + --cfg $DATASET \ + --device $DATA_DIR \ + --partition_method $DISTRIBUTION \ + --client_num_in_total $CLIENT_NUM \ + --client_num_per_round $WORKER_NUM \ + --comm_round $ROUND \ + --epochs $EPOCH \ + --weights $WEIGHTS \ + --batch_size $BATCH_SIZE \ + --lr $LR \ + --ci $CI \ +# --notest \ + --device $DEVICE diff --git a/experiments/distributed/Detection/test.py b/experiments/distributed/Detection/test.py new file mode 100644 index 0000000..7626c07 --- /dev/null +++ b/experiments/distributed/Detection/test.py @@ -0,0 +1,346 @@ +import argparse +import json +import os +from pathlib import Path +from threading import Thread + +import numpy as np +import torch +import yaml +from tqdm import tqdm + +import sys + +sys.path.append("../../../") +# sys.path.append('/home/weiyaowu/Documents/project_doing/fedml/FedML-master') + +from fedml_api.model.object_detection.yolov5.models.experimental import attempt_load +from fedml_api.model.object_detection.yolov5.utils.datasets import create_dataloader +from fedml_api.model.object_detection.yolov5.utils.general import coco80_to_coco91_class, check_dataset, check_file, check_img_size, box_iou, \ + non_max_suppression, scale_coords, xyxy2xywh, xywh2xyxy, set_logging, increment_path +from fedml_api.model.object_detection.yolov5.utils.loss import compute_loss +from fedml_api.model.object_detection.yolov5.utils.metrics import ap_per_class, ConfusionMatrix +from 
fedml_api.model.object_detection.yolov5.utils.plots import plot_images, output_to_target, plot_study_txt +from fedml_api.model.object_detection.yolov5.utils.torch_utils import select_device, time_synchronized + + +def test(data, + weights=None, + batch_size=16, + imgsz=640, + conf_thres=0.001, + iou_thres=0.6, # for NMS + save_json=False, + single_cls=False, + augment=False, + verbose=False, + model=None, + dataloader=None, + save_dir=Path(''), # for saving images + save_txt=False, # for auto-labelling + save_hybrid=False, # for hybrid auto-labelling + save_conf=False, # save auto-label confidences + plots=True, + log_imgs=0): # number of logged images + + # Initialize/load model and set device + training = model is not None + if training: # called by train.py + device = next(model.parameters()).device # get model device + + else: # called directly + set_logging() + device = select_device(opt.device, batch_size=batch_size) + + # device = 'cpu' + # Directories + save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # increment run + (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True) # make dir + + # Load model + model = attempt_load(weights, map_location=device) # load FP32 model + imgsz = check_img_size(imgsz, s=model.stride.max()) # check img_size + + # Multi-GPU disabled, incompatible with .half() https://github.com/ultralytics/yolov5/issues/99 + # if device.type != 'cpu' and torch.cuda.device_count() > 1: + # model = nn.DataParallel(model) + + device = torch.device('cpu') + # Half + half = False + # half = device.type != 'cpu' # half precision only supported on CUDA + if False and half: + model.half() + + # Configure + model = model.to(device) + model.eval() + is_coco = data.endswith('coco.yaml') # is COCO dataset + with open(data) as f: + data = yaml.load(f, Loader=yaml.FullLoader) # model dict + check_dataset(data) # check + nc = 1 if single_cls else int(data['nc']) # number of classes + iouv = 
torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95 + niou = iouv.numel() + + # Logging + log_imgs, wandb = min(log_imgs, 100), None # ceil + try: + import wandb # Weights & Biases + except ImportError: + log_imgs = 0 + + # Dataloader + if not training: + img = torch.zeros((1, 3, imgsz, imgsz), device=device) # init img + _ = model(img.half() if half else img) if device.type != 'cpu' else None # run once + path = data['test'] if opt.task == 'test' else data['val'] # path to val/test images + dataloader = create_dataloader(path, imgsz, batch_size, model.stride.max(), opt, pad=0.5, rect=True)[0] + + seen = 0 + confusion_matrix = ConfusionMatrix(nc=nc) + names = {k: v for k, v in enumerate(model.names if hasattr(model, 'names') else model.module.names)} + coco91class = coco80_to_coco91_class() + s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95') + p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0. + loss = torch.zeros(3, device=device) + jdict, stats, ap, ap_class, wandb_images = [], [], [], [], [] + for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)): + img = img.to(device, non_blocking=True) + img = img.half() if half else img.float() # uint8 to fp16/32 + img /= 255.0 # 0 - 255 to 0.0 - 1.0 + targets = targets.to(device) + nb, _, height, width = img.shape # batch size, channels, height, width + + with torch.no_grad(): + # Run model + t = time_synchronized() + inf_out, train_out = model(img, augment=augment) # inference and training outputs + t0 += time_synchronized() - t + + # Compute loss + if training: + loss += compute_loss([x.float() for x in train_out], targets, model)[1][:3] # box, obj, cls + + # Run NMS + targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device) # to pixels + lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else [] # for autolabelling + t = time_synchronized() + inf_out = inf_out.cpu() + 
output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres, labels=lb) + t1 += time_synchronized() - t + + # Statistics per image + targets = targets.cpu() + for si, pred in enumerate(output): + labels = targets[targets[:, 0] == si, 1:] + nl = len(labels) + tcls = labels[:, 0].tolist() if nl else [] # target class + path = Path(paths[si]) + seen += 1 + + if len(pred) == 0: + if nl: + stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls)) + continue + + # Predictions + predn = pred.clone() + scale_coords(img[si].shape[1:], predn[:, :4], shapes[si][0], shapes[si][1]) # native-space pred + + # Append to text file + if save_txt: + gn = torch.tensor(shapes[si][0])[[1, 0, 1, 0]] # normalization gain whwh + for *xyxy, conf, cls in predn.tolist(): + xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh + line = (cls, *xywh, conf) if save_conf else (cls, *xywh) # label format + with open(save_dir / 'labels' / (path.stem + '.txt'), 'a') as f: + f.write(('%g ' * len(line)).rstrip() % line + '\n') + + # W&B logging + if plots and len(wandb_images) < log_imgs: + box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]}, + "class_id": int(cls), + "box_caption": "%s %.3f" % (names[cls], conf), + "scores": {"class_score": conf}, + "domain": "pixel"} for *xyxy, conf, cls in pred.tolist()] + boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space + wandb_images.append(wandb.Image(img[si], boxes=boxes, caption=path.name)) + + # Append to pycocotools JSON dictionary + if save_json: + # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ... 
+ image_id = int(path.stem) if path.stem.isnumeric() else path.stem + box = xyxy2xywh(predn[:, :4]) # xywh + box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner + for p, b in zip(pred.tolist(), box.tolist()): + jdict.append({'image_id': image_id, + 'category_id': coco91class[int(p[5])] if is_coco else int(p[5]), + 'bbox': [round(x, 3) for x in b], + 'score': round(p[4], 5)}) + + # Assign all predictions as incorrect + correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device) + if nl: + detected = [] # target indices + tcls_tensor = labels[:, 0] + + # target boxes + tbox = xywh2xyxy(labels[:, 1:5]) + scale_coords(img[si].shape[1:], tbox, shapes[si][0], shapes[si][1]) # native-space labels + if plots: + confusion_matrix.process_batch(pred, torch.cat((labels[:, 0:1], tbox), 1)) + + # Per target class + for cls in torch.unique(tcls_tensor): + ti = (cls == tcls_tensor).nonzero(as_tuple=False).view(-1) # prediction indices + pi = (cls == pred[:, 5]).nonzero(as_tuple=False).view(-1) # target indices + + # Search for detections + if pi.shape[0]: + # Prediction to target ious + ious, i = box_iou(predn[pi, :4], tbox[ti]).max(1) # best ious, indices + + # Append detections + detected_set = set() + for j in (ious > iouv[0]).nonzero(as_tuple=False): + d = ti[i[j]] # detected target + if d.item() not in detected_set: + detected_set.add(d.item()) + detected.append(d) + correct[pi[j]] = ious[j] > iouv # iou_thres is 1xn + if len(detected) == nl: # all targets already located in image + break + + # Append statistics (correct, conf, pcls, tcls) + stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls)) + + # Plot images + if plots and batch_i < 3: + f = save_dir / f'test_batch{batch_i}_labels.jpg' # labels + Thread(target=plot_images, args=(img, targets, paths, f, names), daemon=True).start() + f = save_dir / f'test_batch{batch_i}_pred.jpg' # predictions + Thread(target=plot_images, args=(img, output_to_target(output), paths, f, names), 
daemon=True).start() + + # Compute statistics + stats = [np.concatenate(x, 0) for x in zip(*stats)] # to numpy + if len(stats) and stats[0].any(): + p, r, ap, f1, ap_class = ap_per_class(*stats, plot=plots, save_dir=save_dir, names=names) + p, r, ap50, ap = p[:, 0], r[:, 0], ap[:, 0], ap.mean(1) # [P, R, AP@0.5, AP@0.5:0.95] + mp, mr, map50, map = p.mean(), r.mean(), ap50.mean(), ap.mean() + nt = np.bincount(stats[3].astype(np.int64), minlength=nc) # number of targets per class + else: + nt = torch.zeros(1) + + # Print results + pf = '%20s' + '%12.3g' * 6 # print format + print(pf % ('all', seen, nt.sum(), mp, mr, map50, map)) + + # Print results per class + if verbose and nc > 1 and len(stats): + for i, c in enumerate(ap_class): + print(pf % (names[c], seen, nt[c], p[i], r[i], ap50[i], ap[i])) + + # Print speeds + t = tuple(x / seen * 1E3 for x in (t0, t1, t0 + t1)) + (imgsz, imgsz, batch_size) # tuple + if not training: + print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t) + + # Plots + if plots: + confusion_matrix.plot(save_dir=save_dir, names=list(names.values())) + if wandb and wandb.run: + wandb.log({"Images": wandb_images}) + wandb.log({"Validation": [wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('test*.jpg'))]}) + + # Save JSON + if save_json and len(jdict): + w = Path(weights[0] if isinstance(weights, list) else weights).stem if weights is not None else '' # weights + anno_json = '../coco/annotations/instances_val2017.json' # annotations json + pred_json = str(save_dir / f"{w}_predictions.json") # predictions json + print('\nEvaluating pycocotools mAP... saving %s...' 
% pred_json) + with open(pred_json, 'w') as f: + json.dump(jdict, f) + + try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + anno = COCO(anno_json) # init annotations api + pred = anno.loadRes(pred_json) # init predictions api + eval = COCOeval(anno, pred, 'bbox') + if is_coco: + eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files] # image IDs to evaluate + eval.evaluate() + eval.accumulate() + eval.summarize() + map, map50 = eval.stats[:2] # update results (mAP@0.5:0.95, mAP@0.5) + except Exception as e: + print(f'pycocotools unable to run: {e}') + + # Return results + if not training: + s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else '' + print(f"Results saved to {save_dir}{s}") + model.float() # for training + model.cuda() + maps = np.zeros(nc) + map + for i, c in enumerate(ap_class): + maps[c] = ap[i] + return (mp, mr, map50, map, *(loss.cpu() / len(dataloader)).tolist()), maps, t + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(prog='test.py') + parser.add_argument('--weights', nargs='+', type=str, default='yolov5s.pt', help='model.pt path(s)') + parser.add_argument('--data', type=str, default='data/coco128.yaml', help='*.data path') + parser.add_argument('--batch-size', type=int, default=32, help='size of each image batch') + parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--task', default='val', help="'val', 'test', 'study'") + parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') + parser.add_argument('--single-cls', action='store_true', help='treat as single-class dataset') + parser.add_argument('--augment', action='store_true', help='augmented inference') + parser.add_argument('--verbose', action='store_true', help='report mAP by class') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--save-hybrid', action='store_true', help='save label+prediction hybrid results to *.txt') + parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels') + parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') + parser.add_argument('--project', default='runs/test', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') + opt = parser.parse_args() + opt.save_json |= opt.data.endswith('coco.yaml') + opt.data = check_file(opt.data) # check file + print(opt) + # opt.device = 'cpu' + if opt.task in ['val', 'test']: # run normally + test(opt.data, + opt.weights, + opt.batch_size, + opt.img_size, + opt.conf_thres, + opt.iou_thres, + opt.save_json, + opt.single_cls, + opt.augment, + opt.verbose, + save_txt=opt.save_txt | opt.save_hybrid, + save_hybrid=opt.save_hybrid, + save_conf=opt.save_conf, + ) + + elif opt.task == 'study': # run over a range of settings and save/plot + for weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']: + f = 'study_%s_%s.txt' % (Path(opt.data).stem, Path(weights).stem) # filename to save to + x = list(range(320, 800, 64)) # x axis + y = [] # y axis + for i in x: # img-size + print('\nRunning %s point %s...' 
% (f, i)) + r, _, t = test(opt.data, weights, opt.batch_size, i, opt.conf_thres, opt.iou_thres, opt.save_json, + plots=False) + y.append(r + t) # results and times + np.savetxt(f, y, fmt='%10.4g') # save + os.system('zip -r study.zip study_*.txt') + plot_study_txt(f, x) # plot diff --git a/experiments/distributed/Detection/utils/activations.py b/experiments/distributed/Detection/utils/activations.py new file mode 100644 index 0000000..24f5a30 --- /dev/null +++ b/experiments/distributed/Detection/utils/activations.py @@ -0,0 +1,72 @@ +# Activation functions + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# Swish https://arxiv.org/pdf/1905.02244.pdf --------------------------------------------------------------------------- +class Swish(nn.Module): # + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +class Hardswish(nn.Module): # export-friendly version of nn.Hardswish() + @staticmethod + def forward(x): + # return x * F.hardsigmoid(x) # for torchscript and CoreML + return x * F.hardtanh(x + 3, 0., 6.) / 6. 
# for torchscript, CoreML and ONNX + + +class MemoryEfficientSwish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x * torch.sigmoid(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + return grad_output * (sx * (1 + x * (1 - sx))) + + def forward(self, x): + return self.F.apply(x) + + +# Mish https://github.com/digantamisra98/Mish -------------------------------------------------------------------------- +class Mish(nn.Module): + @staticmethod + def forward(x): + return x * F.softplus(x).tanh() + + +class MemoryEfficientMish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + fx = F.softplus(x).tanh() + return grad_output * (fx + x * sx * (1 - fx * fx)) + + def forward(self, x): + return self.F.apply(x) + + +# FReLU https://arxiv.org/abs/2007.11824 ------------------------------------------------------------------------------- +class FReLU(nn.Module): + def __init__(self, c1, k=3): # ch_in, kernel + super().__init__() + self.conv = nn.Conv2d(c1, c1, k, 1, 1, groups=c1, bias=False) + self.bn = nn.BatchNorm2d(c1) + + def forward(self, x): + return torch.max(x, self.bn(self.conv(x))) diff --git a/experiments/distributed/Detection/utils/datasets.py b/experiments/distributed/Detection/utils/datasets.py new file mode 100644 index 0000000..313180f --- /dev/null +++ b/experiments/distributed/Detection/utils/datasets.py @@ -0,0 +1,933 @@ +# Dataset utils and dataloaders + +import glob +import logging +import math +import os +import random +import shutil +import time +from itertools import repeat +from multiprocessing.pool import ThreadPool +from pathlib import Path +from threading import Thread + +import cv2 
+import numpy as np +import torch +from PIL import Image, ExifTags +from torch.utils.data import Dataset +from tqdm import tqdm + +from .general import xyxy2xywh, xywh2xyxy +from .torch_utils import torch_distributed_zero_first + +# Parameters +help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data' +img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng'] # acceptable image suffixes +vid_formats = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes +logger = logging.getLogger(__name__) + +# Get orientation exif tag +for orientation in ExifTags.TAGS.keys(): + if ExifTags.TAGS[orientation] == 'Orientation': + break + + +def get_hash(files): + # Returns a single hash value of a list of files + return sum(os.path.getsize(f) for f in files if os.path.isfile(f)) + + +def exif_size(img): + # Returns exif-corrected PIL size + s = img.size # (width, height) + try: + rotation = dict(img._getexif().items())[orientation] + if rotation == 6: # rotation 270 + s = (s[1], s[0]) + elif rotation == 8: # rotation 90 + s = (s[1], s[0]) + except: + pass + + return s + + +def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False, + rank=-1, world_size=1, workers=8, image_weights=False): + # Make sure only the first process in DDP process the dataset first, and the following others can use the cache + with torch_distributed_zero_first(rank): + dataset = LoadImagesAndLabels(path, imgsz, batch_size, + augment=augment, # augment images + hyp=hyp, # augmentation hyperparameters + rect=rect, # rectangular training + cache_images=cache, + single_cls=opt.single_cls, + stride=int(stride), + pad=pad, + rank=rank, + image_weights=image_weights) + + batch_size = min(batch_size, len(dataset)) + nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, workers]) # number of workers + sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != 
-1 else None + loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader + # Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader() + dataloader = loader(dataset, + batch_size=batch_size, + num_workers=nw, + sampler=sampler, + pin_memory=True, + collate_fn=LoadImagesAndLabels.collate_fn) + return dataloader, dataset + + +class InfiniteDataLoader(torch.utils.data.dataloader.DataLoader): + """ Dataloader that reuses workers + + Uses same syntax as vanilla DataLoader + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler)) + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever + + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) + + +class LoadImages: # for inference + def __init__(self, path, img_size=640): + p = str(Path(path)) # os-agnostic + p = os.path.abspath(p) # absolute path + if '*' in p: + files = sorted(glob.glob(p, recursive=True)) # glob + elif os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '*.*'))) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise Exception('ERROR: %s does not exist' % p) + + images = [x for x in files if x.split('.')[-1].lower() in img_formats] + videos = [x for x in files if x.split('.')[-1].lower() in vid_formats] + ni, nv = len(images), len(videos) + + self.img_size = img_size + self.files = images + videos + self.nf = ni + nv # number of files + self.video_flag = [False] * ni + [True] * nv + self.mode = 'image' + if any(videos): + self.new_video(videos[0]) # new video + else: + self.cap = None + assert self.nf > 0, 'No 
images or videos found in %s. Supported formats are:\nimages: %s\nvideos: %s' % \ + (p, img_formats, vid_formats) + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + + if self.video_flag[self.count]: + # Read video + self.mode = 'video' + ret_val, img0 = self.cap.read() + if not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + else: + path = self.files[self.count] + self.new_video(path) + ret_val, img0 = self.cap.read() + + self.frame += 1 + print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nf, self.frame, self.nframes, path), end='') + + else: + # Read image + self.count += 1 + img0 = cv2.imread(path) # BGR + assert img0 is not None, 'Image Not Found ' + path + print('image %g/%g %s: ' % (self.count, self.nf, path), end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return path, img, img0, self.cap + + def new_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files + + +class LoadWebcam: # for inference + def __init__(self, pipe='0', img_size=640): + self.img_size = img_size + + if pipe.isnumeric(): + pipe = eval(pipe) # local camera + # pipe = 'rtsp://192.168.1.64/1' # IP camera + # pipe = 'rtsp://username:password@192.168.1.64/1' # IP camera with login + # pipe = 'http://wmccpinetop.axiscam.net/mjpg/video.mjpg' # IP golf camera + + self.pipe = pipe + self.cap = cv2.VideoCapture(pipe) # video capture object + self.cap.set(cv2.CAP_PROP_BUFFERSIZE, 3) # set buffer size + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + if cv2.waitKey(1) == ord('q'): # q to 
quit + self.cap.release() + cv2.destroyAllWindows() + raise StopIteration + + # Read frame + if self.pipe == 0: # local camera + ret_val, img0 = self.cap.read() + img0 = cv2.flip(img0, 1) # flip left-right + else: # IP camera + n = 0 + while True: + n += 1 + self.cap.grab() + if n % 30 == 0: # skip frames + ret_val, img0 = self.cap.retrieve() + if ret_val: + break + + # Print + assert ret_val, 'Camera Error %s' % self.pipe + img_path = 'webcam.jpg' + print('webcam %g: ' % self.count, end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return img_path, img, img0, None + + def __len__(self): + return 0 + + +class LoadStreams: # multiple IP or RTSP cameras + def __init__(self, sources='streams.txt', img_size=640): + self.mode = 'stream' + self.img_size = img_size + + if os.path.isfile(sources): + with open(sources, 'r') as f: + sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())] + else: + sources = [sources] + + n = len(sources) + self.imgs = [None] * n + self.sources = sources + for i, s in enumerate(sources): + # Start the thread to read frames from the video stream + print('%g/%g: %s... ' % (i + 1, n, s), end='') + cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s) + assert cap.isOpened(), 'Failed to open %s' % s + w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) % 100 + _, self.imgs[i] = cap.read() # guarantee first frame + thread = Thread(target=self.update, args=([i, cap]), daemon=True) + print(' success (%gx%g at %.2f FPS).' 
% (w, h, fps)) + thread.start() + print('') # newline + + # check for common shapes + s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes + self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal + if not self.rect: + print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.') + + def update(self, index, cap): + # Read next stream frame in a daemon thread + n = 0 + while cap.isOpened(): + n += 1 + # _, self.imgs[index] = cap.read() + cap.grab() + if n == 4: # read every 4th frame + _, self.imgs[index] = cap.retrieve() + n = 0 + time.sleep(0.01) # wait time + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + img0 = self.imgs.copy() + if cv2.waitKey(1) == ord('q'): # q to quit + cv2.destroyAllWindows() + raise StopIteration + + # Letterbox + img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0] + + # Stack + img = np.stack(img, 0) + + # Convert + img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416 + img = np.ascontiguousarray(img) + + return self.sources, img, img0, None + + def __len__(self): + return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years + + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep # /images/, /labels/ substrings + return [x.replace(sa, sb, 1).replace('.' 
+ x.split('.')[-1], '.txt') for x in img_paths] + + +class LoadImagesAndLabels(Dataset): # for training/testing + def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False, + cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1): + self.img_size = img_size + self.augment = augment + self.hyp = hyp + self.image_weights = image_weights + self.rect = False if image_weights else rect + self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training) + self.mosaic_border = [-img_size // 2, -img_size // 2] + self.stride = stride + + try: + f = [] # image files + for p in path if isinstance(path, list) else [path]: + p = Path(p) # os-agnostic + if p.is_dir(): # dir + f += glob.glob(str(p / '**' / '*.*'), recursive=True) + elif p.is_file(): # file + with open(p, 'r') as t: + t = t.read().strip().splitlines() + parent = str(p.parent) + os.sep + f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + else: + raise Exception('%s does not exist' % p) + self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats]) + assert self.img_files, 'No images found' + except Exception as e: + raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) + + # Check cache + self.label_files = img2label_paths(self.img_files) # labels + cache_path = Path(self.label_files[0]).parent.with_suffix('.cache') # cached labels + if cache_path.is_file(): + cache = torch.load(cache_path) # load + if cache['hash'] != get_hash(self.label_files + self.img_files) or 'results' not in cache: # changed + cache = self.cache_labels(cache_path) # re-cache + else: + cache = self.cache_labels(cache_path) # cache + + # Display cache + [nf, nm, ne, nc, n] = cache.pop('results') # found, missing, empty, corrupted, total + desc = f"Scanning '{cache_path}' for images and labels... 
{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + tqdm(None, desc=desc, total=n, initial=n) + assert nf > 0 or not augment, f'No labels found in {cache_path}. Can not train without labels. See {help_url}' + + # Read cache + cache.pop('hash') # remove hash + labels, shapes = zip(*cache.values()) + self.labels = list(labels) + self.shapes = np.array(shapes, dtype=np.float64) + self.img_files = list(cache.keys()) # update + self.label_files = img2label_paths(cache.keys()) # update + if single_cls: + for x in self.labels: + x[:, 0] = 0 + + n = len(shapes) # number of images + bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index + nb = bi[-1] + 1 # number of batches + self.batch = bi # batch index of image + self.n = n + self.indices = range(n) + + # Rectangular Training + if self.rect: + # Sort by aspect ratio + s = self.shapes # wh + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_files = [self.img_files[i] for i in irect] + self.label_files = [self.label_files[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * nb + for i in range(nb): + ari = ar[bi == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [maxi, 1] + elif mini > 1: + shapes[i] = [1, 1 / mini] + + self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride + + # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM) + self.imgs = [None] * n + if cache_images: + gb = 0 # Gigabytes of cached images + self.img_hw0, self.img_hw = [None] * n, [None] * n + results = ThreadPool(8).imap(lambda x: load_image(*x), zip(repeat(self), range(n))) # 8 threads + pbar = tqdm(enumerate(results), total=n) + for i, x in pbar: + self.imgs[i], self.img_hw0[i], self.img_hw[i] = x # img, hw_original, hw_resized = load_image(self, i) + gb += self.imgs[i].nbytes + pbar.desc = 
'Caching images (%.1fGB)' % (gb / 1E9) + + def cache_labels(self, path=Path('./labels.cache')): + # Cache dataset labels, check images and read shapes + x = {} # dict + nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, duplicate + pbar = tqdm(zip(self.img_files, self.label_files), desc='Scanning images', total=len(self.img_files)) + for i, (im_file, lb_file) in enumerate(pbar): + try: + # verify images + im = Image.open(im_file) + im.verify() # PIL verify + shape = exif_size(im) # image size + assert (shape[0] > 9) & (shape[1] > 9), 'image size <10 pixels' + + # verify labels + if os.path.isfile(lb_file): + nf += 1 # label found + with open(lb_file, 'r') as f: + l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + if len(l): + assert l.shape[1] == 5, 'labels require 5 columns each' + assert (l >= 0).all(), 'negative labels' + assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels' + else: + ne += 1 # label empty + l = np.zeros((0, 5), dtype=np.float32) + else: + nm += 1 # label missing + l = np.zeros((0, 5), dtype=np.float32) + x[im_file] = [l, shape] + except Exception as e: + nc += 1 + print('WARNING: Ignoring corrupted image and/or label %s: %s' % (im_file, e)) + + pbar.desc = f"Scanning '{path.parent / path.stem}' for images and labels... " \ + f"{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + + if nf == 0: + print(f'WARNING: No labels found in {path}. 
See {help_url}') + + x['hash'] = get_hash(self.label_files + self.img_files) + x['results'] = [nf, nm, ne, nc, i + 1] + torch.save(x, path) # save for next time + logging.info(f"New cache created: {path}") + return x + + def __len__(self): + return len(self.img_files) + + # def __iter__(self): + # self.count = -1 + # print('ran dataset iter') + # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) + # return self + + def __getitem__(self, index): + index = self.indices[index] # linear, shuffled, or image_weights + + hyp = self.hyp + mosaic = self.mosaic and random.random() < hyp['mosaic'] + if mosaic: + # Load mosaic + img, labels = load_mosaic(self, index) + shapes = None + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if random.random() < hyp['mixup']: + img2, labels2 = load_mosaic(self, random.randint(0, self.n - 1)) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + + else: + # Load image + img, (h0, w0), (h, w) = load_image(self, index) + + # Letterbox + shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape + img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling + + # Load labels + labels = [] + x = self.labels[index] + if x.size > 0: + # Normalized xywh to pixel xyxy format + labels = x.copy() + labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0] # pad width + labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1] # pad height + labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0] + labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1] + + if self.augment: + # Augment imagespace + if not mosaic: + img, labels = random_perspective(img, labels, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + 
shear=hyp['shear'], + perspective=hyp['perspective']) + + # Augment colorspace + augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) + + # Apply cutouts + # if random.random() < 0.9: + # labels = cutout(img, labels) + + nL = len(labels) # number of labels + if nL: + labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) # convert xyxy to xywh + labels[:, [2, 4]] /= img.shape[0] # normalized height 0-1 + labels[:, [1, 3]] /= img.shape[1] # normalized width 0-1 + + if self.augment: + # flip up-down + if random.random() < hyp['flipud']: + img = np.flipud(img) + if nL: + labels[:, 2] = 1 - labels[:, 2] + + # flip left-right + if random.random() < hyp['fliplr']: + img = np.fliplr(img) + if nL: + labels[:, 1] = 1 - labels[:, 1] + + labels_out = torch.zeros((nL, 6)) + if nL: + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return torch.from_numpy(img), labels_out, self.img_files[index], shapes + + @staticmethod + def collate_fn(batch): + img, label, path, shapes = zip(*batch) # transposed + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes + + +# Ancillary functions -------------------------------------------------------------------------------------------------- +def load_image(self, index): + # loads 1 image from dataset, returns img, original hw, resized hw + img = self.imgs[index] + if img is None: # not cached + path = self.img_files[index] + img = cv2.imread(path) # BGR + assert img is not None, 'Image Not Found ' + path + h0, w0 = img.shape[:2] # orig hw + r = self.img_size / max(h0, w0) # resize image to img_size + if r != 1: # always resize down, only resize up if training with augmentation + interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR + img = cv2.resize(img, (int(w0 * r), int(h0 * r)), 
interpolation=interp) + return img, (h0, w0), img.shape[:2] # img, hw_original, hw_resized + else: + return self.imgs[index], self.img_hw0[index], self.img_hw[index] # img, hw_original, hw_resized + + +def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5): + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=np.int16) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + + # Histogram equalization + # if random.random() < 0.2: + # for i in range(3): + # img[:, :, i] = cv2.equalizeHist(img[:, :, i]) + + +def load_mosaic(self, index): + # loads images in a mosaic + + labels4 = [] + s = self.img_size + yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border] # mosaic center x, y + indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(3)] # 3 additional image indices + for i, index in enumerate(indices): + # Load image + img, _, (h, w) = load_image(self, index) + + # place img in img4 + if i == 0: # top left + img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) 
+ elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + x = self.labels[index] + labels = x.copy() + if x.size > 0: # Normalized xywh to pixel xyxy format + labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw + labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh + labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw + labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh + labels4.append(labels) + + # Concat/clip labels + if len(labels4): + labels4 = np.concatenate(labels4, 0) + np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:]) # use with random_perspective + # img4, labels4 = replicate(img4, labels4) # replicate + + # Augment + img4, labels4 = random_perspective(img4, labels4, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + perspective=self.hyp['perspective'], + border=self.mosaic_border) # border to remove + + return img4, labels4 + + +def replicate(img, labels): + # Replicate labels + h, w = img.shape[:2] + boxes = labels[:, 1:].astype(int) + x1, y1, x2, y2 = boxes.T + s = ((x2 - x1) + (y2 - y1)) / 2 # side length (pixels) + for i in s.argsort()[:round(s.size * 0.5)]: # smallest indices + x1b, y1b, x2b, y2b = boxes[i] + bh, bw = y2b - y1b, x2b - x1b + yc, xc = int(random.uniform(0, h - bh)), int(random.uniform(0, w - bw)) # offset x, y + x1a, y1a, x2a, y2a = [xc, yc, xc + bw, yc + bh] + img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + labels = np.append(labels, [[labels[i, 0], x1a, y1a, x2a, y2a]], axis=0) + + return img, labels + + +def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True): + # Resize image to a 32-pixel-multiple rectangle 
https://github.com/ultralytics/yolov3/issues/232 + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return img, ratio, (dw, dh) + + +def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # targets = [cls, xyxy] + + height = img.shape[0] + border[0] * 2 # shape(h,w,c) + width = img.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = -img.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3) + P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) + P[2, 1] = random.uniform(-perspective, perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3) + a = 
random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if perspective: + img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + # Visualize + # import matplotlib.pyplot as plt + # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() + # ax[0].imshow(img[:, :, ::-1]) # base + # ax[1].imshow(img2[:, :, ::-1]) # warped + + # Transform label coordinates + n = len(targets) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + if perspective: + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + else: # affine + xy = xy[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # # apply angle-based reduction of bounding boxes + # radians = a * math.pi / 180 + # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + # x = (xy[:, 2] + xy[:, 0]) / 2 + # y = (xy[:, 3] 
+ xy[:, 1]) / 2 + # w = (xy[:, 2] - xy[:, 0]) * reduction + # h = (xy[:, 3] - xy[:, 1]) * reduction + # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + + # filter candidates + i = box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T) + targets = targets[i] + targets[:, 1:5] = xy[i] + + return img, targets + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1): # box1(4,n), box2(4,n) + # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates + + +def cutout(image, labels): + # Applies image cutout augmentation https://arxiv.org/abs/1708.04552 + h, w = image.shape[:2] + + def bbox_ioa(box1, box2): + # Returns the intersection over box2 area given box1, box2. box1 is 4, box2 is nx4. 
boxes are x1y1x2y2 + box2 = box2.transpose() + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + 1e-16 + + # Intersection over box2 area + return inter_area / box2_area + + # create random masks + scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16 # image size fraction + for s in scales: + mask_h = random.randint(1, int(h * s)) + mask_w = random.randint(1, int(w * s)) + + # box + xmin = max(0, random.randint(0, w) - mask_w // 2) + ymin = max(0, random.randint(0, h) - mask_h // 2) + xmax = min(w, xmin + mask_w) + ymax = min(h, ymin + mask_h) + + # apply random color mask + image[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)] + + # return unobscured labels + if len(labels) and s > 0.03: + box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32) + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + labels = labels[ioa < 0.60] # remove >60% obscured labels + + return labels + + +def create_folder(path='./new'): + # Create folder + if os.path.exists(path): + shutil.rmtree(path) # delete output folder + os.makedirs(path) # make new output folder + + +def flatten_recursive(path='../coco128'): + # Flatten a recursive directory by bringing all files to top level + new_path = Path(path + '_flat') + create_folder(new_path) + for file in tqdm(glob.glob(str(Path(path)) + '/**/*.*', recursive=True)): + shutil.copyfile(file, new_path / Path(file).name) + + +def extract_boxes(path='../coco128/'): # from utils.datasets import *; extract_boxes('../coco128') + # Convert detection dataset into classification dataset, with one directory per class + + path = Path(path) # 
images dir + shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing + files = list(path.rglob('*.*')) + n = len(files) # number of files + for im_file in tqdm(files, total=n): + if im_file.suffix[1:] in img_formats: + # image + im = cv2.imread(str(im_file))[..., ::-1] # BGR to RGB + h, w = im.shape[:2] + + # labels + lb_file = Path(img2label_paths([str(im_file)])[0]) + if Path(lb_file).exists(): + with open(lb_file, 'r') as f: + lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + + for j, x in enumerate(lb): + c = int(x[0]) # class + f = (path / 'classifier') / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg' # new filename + if not f.parent.is_dir(): + f.parent.mkdir(parents=True) + + b = x[1:] * [w, h, w, h] # box + # b[2:] = b[2:].max() # rectangle to square + b[2:] = b[2:] * 1.2 + 3 # pad + b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(np.int) + + b[[0, 2]] = np.clip(b[[0, 2]], 0, w) # clip boxes outside of image + b[[1, 3]] = np.clip(b[[1, 3]], 0, h) + assert cv2.imwrite(str(f), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {f}' + + +def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0)): # from utils.datasets import *; autosplit('../coco128') + """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files + # Arguments + path: Path to images directory + weights: Train, val, test weights (list) + """ + path = Path(path) # images dir + files = list(path.rglob('*.*')) + n = len(files) # number of files + indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split + txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt'] # 3 txt files + [(path / x).unlink() for x in txt if (path / x).exists()] # remove existing + for i, img in tqdm(zip(indices, files), total=n): + if img.suffix[1:] in img_formats: + with open(path / txt[i], 'a') as f: + f.write(str(img) + '\n') # add image to txt file diff --git 
a/experiments/distributed/Detection/utils/google_app_engine/Dockerfile b/experiments/distributed/Detection/utils/google_app_engine/Dockerfile new file mode 100644 index 0000000..0155618 --- /dev/null +++ b/experiments/distributed/Detection/utils/google_app_engine/Dockerfile @@ -0,0 +1,25 @@ +FROM gcr.io/google-appengine/python + +# Create a virtualenv for dependencies. This isolates these packages from +# system-level packages. +# Use -p python3 or -p python3.7 to select python version. Default is version 2. +RUN virtualenv /env -p python3 + +# Setting these environment variables are the same as running +# source /env/bin/activate. +ENV VIRTUAL_ENV /env +ENV PATH /env/bin:$PATH + +RUN apt-get update && apt-get install -y python-opencv + +# Copy the application's requirements.txt and run pip to install all +# dependencies into the virtualenv. +ADD requirements.txt /app/requirements.txt +RUN pip install -r /app/requirements.txt + +# Add the application source code. +ADD . /app + +# Run a WSGI server to serve the application. gunicorn must be declared as +# a dependency in requirements.txt. 
+CMD gunicorn -b :$PORT main:app diff --git a/experiments/distributed/Detection/utils/google_app_engine/additional_requirements.txt b/experiments/distributed/Detection/utils/google_app_engine/additional_requirements.txt new file mode 100644 index 0000000..5fcc305 --- /dev/null +++ b/experiments/distributed/Detection/utils/google_app_engine/additional_requirements.txt @@ -0,0 +1,4 @@ +# add these requirements in your app on top of the existing ones +pip==18.1 +Flask==1.0.2 +gunicorn==19.9.0 diff --git a/experiments/distributed/Detection/utils/google_app_engine/app.yaml b/experiments/distributed/Detection/utils/google_app_engine/app.yaml new file mode 100644 index 0000000..ac29d10 --- /dev/null +++ b/experiments/distributed/Detection/utils/google_app_engine/app.yaml @@ -0,0 +1,14 @@ +runtime: custom +env: flex + +service: yolov5app + +liveness_check: + initial_delay_sec: 600 + +manual_scaling: + instances: 1 +resources: + cpu: 1 + memory_gb: 4 + disk_size_gb: 20 \ No newline at end of file diff --git a/experiments/distributed/Detection/utils/metrics.py b/experiments/distributed/Detection/utils/metrics.py new file mode 100644 index 0000000..99d5bcf --- /dev/null +++ b/experiments/distributed/Detection/utils/metrics.py @@ -0,0 +1,200 @@ +# Model validation metrics + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from . import general + + +def fitness(x): + # Model fitness as a weighted combination of metrics + w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95] + return (x[:, :4] * w).sum(1) + + +def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='precision-recall_curve.png', names=[]): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (nparray, nx1 or nx10). + conf: Objectness value from 0-1 (nparray). + pred_cls: Predicted object classes (nparray). 
+ target_cls: True object classes (nparray). + plot: Plot precision-recall curve at mAP@0.5 + save_dir: Plot save directory + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(target_cls) + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + pr_score = 0.1 # score to evaluate P and R https://github.com/ultralytics/yolov3/issues/898 + s = [unique_classes.shape[0], tp.shape[1]] # number class, number iou thresholds (i.e. 10 for mAP0.5...0.95) + ap, p, r = np.zeros(s), np.zeros(s), np.zeros(s) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = (target_cls == c).sum() # number of labels + n_p = i.sum() # number of predictions + + if n_p == 0 or n_l == 0: + continue + else: + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + 1e-16) # recall curve + r[ci] = np.interp(-pr_score, -conf[i], recall[:, 0]) # r at pr_score, negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-pr_score, -conf[i], precision[:, 0]) # p at pr_score + + # AP from recall-precision curve + for j in range(tp.shape[1]): + ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + if plot and (j == 0): + py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 + + # Compute F1 score (harmonic mean of precision and recall) + f1 = 2 * p * r / (p + r + 1e-16) + + if plot: + plot_pr_curve(px, py, ap, save_dir, names) + + return p, r, ap, f1, unique_classes.astype('int32') + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves + # Arguments + recall: The recall curve (list) + precision: The precision curve (list) + # Returns + Average precision, 
Update the confusion matrix for one batch of detections and ground-truth labels.
+ Arguments: + detections (Array[N, 6]), x1, y1, x2, y2, conf, class + labels (Array[M, 5]), class, x1, y1, x2, y2 + Returns: + None, updates confusion matrix accordingly + """ + detections = detections[detections[:, 4] > self.conf] + gt_classes = labels[:, 0].int() + detection_classes = detections[:, 5].int() + iou = general.box_iou(labels[:, 1:], detections[:, :4]) + + x = torch.where(iou > self.iou_thres) + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + else: + matches = np.zeros((0, 3)) + + n = matches.shape[0] > 0 + m0, m1, _ = matches.transpose().astype(np.int16) + for i, gc in enumerate(gt_classes): + j = m0 == i + if n and sum(j) == 1: + self.matrix[gc, detection_classes[m1[j]]] += 1 # correct + else: + self.matrix[gc, self.nc] += 1 # background FP + + if n: + for i, dc in enumerate(detection_classes): + if not any(m1 == i): + self.matrix[self.nc, dc] += 1 # background FN + + def matrix(self): + return self.matrix + + def plot(self, save_dir='', names=()): + try: + import seaborn as sn + + array = self.matrix / (self.matrix.sum(0).reshape(1, self.nc + 1) + 1E-6) # normalize + array[array < 0.005] = np.nan # don't annotate (would appear as 0.00) + + fig = plt.figure(figsize=(12, 9), tight_layout=True) + sn.set(font_scale=1.0 if self.nc < 50 else 0.8) # for label size + labels = (0 < len(names) < 99) and len(names) == self.nc # apply names to ticklabels + sn.heatmap(array, annot=self.nc < 30, annot_kws={"size": 8}, cmap='Blues', fmt='.2f', square=True, + xticklabels=names + ['background FN'] if labels else "auto", + yticklabels=names + ['background FP'] if labels else "auto").set_facecolor((1, 1, 1)) + fig.axes[0].set_xlabel('True') + 
if 0 < len(names) < 21:  # show mAP in legend if < 21 classes
--git a/experiments/distributed/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md b/experiments/distributed/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md new file mode 100644 index 0000000..e3797fd --- /dev/null +++ b/experiments/distributed/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md @@ -0,0 +1,41 @@ +# ILSVRC2012-100 + + + + + + + + + + +# 10 clients +``` + +# DAAI +# srun -N2 -B 4-4:2-2 \ +# srun -w hkbugpusrv03 -n 21 -B 21:4 \ +salloc -w hkbugpusrv03 -n 21 --cpus-per-task=4 \ +mpiexec \ + ~/py36/bin/python ./main.py \ + --gpu_util_parse "hkbugpusrv03:6,5,5,5" \ + --client_num_per_round 20 --client_num_in_total 100 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .992 + +``` + + + + + + + diff --git a/experiments/distributed/classification/experiment_scripts/gld160k EfficientNet.md b/experiments/distributed/classification/experiment_scripts/gld160k EfficientNet.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/experiment_scripts/gld160k MobileNetV3.md b/experiments/distributed/classification/experiment_scripts/gld160k MobileNetV3.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/experiment_scripts/gld23k EfficientNet.md b/experiments/distributed/classification/experiment_scripts/gld23k EfficientNet.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/experiment_scripts/gld23k MobileNetV3.md 
# gld23k MobileNetV3-Large-100
gpu1:1,gpu3:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu3:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1" \ + --client_num_per_round 5 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --data_load_num_workers 2 \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --comm_round 300 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .97 +``` + + +# 10 clients +``` +# bad, killed +mpirun -np 11 -host gpu1:1,gpu3:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1,gpu8:1,gpu9:1,gpu10:1,gpu11:1,gpu13:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu3:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1;gpu8:1;gpu9:1;gpu10:1;gpu11:1;gpu13:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .97 + +mpirun -np 11 -host gpu1:1,gpu27:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1,gpu8:1,gpu9:1,gpu10:1,gpu11:1,gpu13:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu27:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1;gpu8:1;gpu9:1;gpu10:1;gpu11:1;gpu13:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 
\ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .97 + +mpirun -np 11 -host gpu1:1,gpu3:1,gpu4:1,gpu5:1,gpu6:1,gpu7:1,gpu8:1,gpu9:1,gpu10:1,gpu11:1,gpu13:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu1:1;gpu3:1;gpu4:1;gpu5:1;gpu6:1;gpu7:1;gpu8:1;gpu9:1;gpu10:1;gpu11:1;gpu13:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 16 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .992 + +mpirun -np 11 -host gpu14:1,gpu15:1,gpu16:1,gpu17:1,gpu19:1,gpu20:1,gpu21:1,gpu22:1,gpu23:1,gpu24:1,gpu26:1 \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + ~/miniconda3/bin/python ./main.py \ + --gpu_util_parse "gpu14:1;gpu15:1;gpu16:1;gpu17:1;gpu19:1;gpu20:1;gpu21:1;gpu22:1;gpu23:1;gpu24:1;gpu26:1" \ + --client_num_per_round 10 --client_num_in_total 233 \ + --gpu_server_num 1 --gpu_num_per_server 1 --ci 0 \ + --frequency_of_the_test 10 \ + --dataset gld23k --data_dir /nfs_home/datasets/landmarks \ + --if-timm-dataset -b 64 --data_transform FLTransform \ + --data_load_num_workers 2 \ + --comm_round 1000 --epochs 1 \ + --model mobilenet_v3 \ + --drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ + --opt rmsproptf --lr 0.003 --opt-eps .001 
--warmup-lr 1e-6 --weight-decay 1e-5 \ + --sched step --decay-rounds 1 --decay-rate .992 +``` + + + + + + + + + diff --git a/experiments/distributed/classification/gpuutils/DAAI_gpu_util.yaml b/experiments/distributed/classification/gpuutils/DAAI_gpu_util.yaml new file mode 100644 index 0000000..e69de29 diff --git a/experiments/distributed/classification/gpuutils/scigpu_gpu_util.yaml b/experiments/distributed/classification/gpuutils/scigpu_gpu_util.yaml new file mode 100644 index 0000000..8020d5a --- /dev/null +++ b/experiments/distributed/classification/gpuutils/scigpu_gpu_util.yaml @@ -0,0 +1,36 @@ +local_10: + localhost: [2, 2, 3, 3] + +local_11: + localhost: [2, 3, 3, 3] + +local_20: + localhost: [5, 5, 5, 5] + +local_21: + localhost: [6, 5, 5, 5] + + +gpu_util_11: + scigpu10: [2, 1 ,1 ,1] + scigpu11: [1, 2, 2, 1] + +gpu_util_21: + scigpu10: [2, 1, 1, 1] + scigpu11: [1, 1, 1, 1] + scigpu13: [2, 2, 1, 2] + scigpu14: [2, 1, 1, 1] + +gpu_util_41: + scigpu10: [5, 4 ,4 ,4] + scigpu11: [4, 4, 4, 4] + scigpu12: [8, 8 ,0 ,0] + scigpu14: [2, 2, 2, 2] + +gpu_util_51: + scigpu10: [5, 4 ,4 ,4] + scigpu11: [4, 4, 4, 4] + scigpu12: [8, 8 ,0 ,0] + scigpu13: [2, 2, 2, 2] + scigpu14: [2, 2, 2, 2] + diff --git a/experiments/distributed/classification/gpuutils/t716_gpu_util.yaml b/experiments/distributed/classification/gpuutils/t716_gpu_util.yaml new file mode 100644 index 0000000..0fdd753 --- /dev/null +++ b/experiments/distributed/classification/gpuutils/t716_gpu_util.yaml @@ -0,0 +1,121 @@ +4gpus: + gpu1: [1] + gpu5: [1] + gpu3: [1] + gpu4: [1] + +4gpus_1: + gpu6: [1] + gpu7: [1] + gpu8: [1] + gpu9: [1] + +4gpus_2: + gpu10: [1] + gpu11: [1] + gpu13: [1] + gpu14: [1] +4gpus_3: + gpu15: [1] + gpu16: [1] + gpu17: [1] + gpu19: [1] +4gpus_4: + gpu20: [1] + gpu21: [1] + gpu22: [1] + gpu23: [1] + +5gpus: + gpu1: [1] + gpu2: [1] + gpu3: [1] + gpu4: [1] + gpu5: [1] + +10gpus: + gpu1: [2] + gpu2: [2] + gpu3: [2] + gpu4: [2] + gpu5: [2] + +11gpus_2: + gpu1: [1] + gpu3: [1] + gpu4: [1] + 
gpu5: [1] + gpu6: [1] + gpu7: [1] + gpu8: [1] + gpu9: [1] + gpu10: [1] + gpu11: [1] + gpu13: [1] + +20gpus: + gpu1: [2] + gpu2: [2] + gpu3: [2] + gpu4: [2] + gpu5: [2] + gpu6: [2] + gpu7: [2] + gpu8: [2] + gpu9: [2] + gpu10: [2] + +21gpus_2: + gpu1: [1] + gpu3: [1] + gpu4: [1] + gpu5: [1] + gpu6: [1] + gpu7: [1] + gpu8: [1] + gpu9: [1] + gpu10: [1] + gpu11: [1] + gpu13: [1] + gpu14: [1] + gpu15: [1] + gpu16: [1] + gpu17: [1] + gpu19: [1] + gpu20: [1] + gpu21: [1] + gpu22: [1] + gpu23: [1] + gpu24: [1] + +40gpus: + gpu1: [2] + gpu2: [2] + gpu3: [2] + gpu4: [2] + gpu5: [2] + gpu6: [2] + gpu7: [2] + gpu8: [2] + gpu9: [2] + gpu10: [2] + gpu11: [2] + gpu12: [2] + gpu13: [2] + gpu14: [2] + gpu15: [2] + gpu16: [2] + gpu17: [2] + gpu18: [2] + gpu19: [2] + gpu20: [2] + + + + + + + + + + diff --git a/experiments/distributed/classification/helloworld.py b/experiments/distributed/classification/helloworld.py new file mode 100644 index 0000000..b1de7cf --- /dev/null +++ b/experiments/distributed/classification/helloworld.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +""" +Parallel Hello World +""" + +from mpi4py import MPI +import sys +import time + +size = MPI.COMM_WORLD.Get_size() +rank = MPI.COMM_WORLD.Get_rank() +name = MPI.Get_processor_name() + +sys.stdout.write("Hello, World! 
I am process %d of %d on %s.\n" % (rank, size, name)) +time.sleep(300) diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile new file mode 100644 index 0000000..75c8e45 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile @@ -0,0 +1 @@ +localhost:10 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_11 b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_11 new file mode 100644 index 0000000..ab8648d --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_11 @@ -0,0 +1 @@ +localhost:11 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_20 b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_20 new file mode 100644 index 0000000..d211bf4 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_20 @@ -0,0 +1 @@ +localhost:20 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_21 b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_21 new file mode 100644 index 0000000..952486e --- /dev/null +++ b/experiments/distributed/classification/hostfiles/scigpu_local_hostfile_21 @@ -0,0 +1 @@ +localhost:21 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_10 b/experiments/distributed/classification/hostfiles/t716_hostfile_10 new file mode 100644 index 0000000..e5a064e --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_10 @@ -0,0 +1,5 @@ +gpu1 slots=2 +gpu2 slots=2 +gpu3 slots=2 +gpu4 slots=2 +gpu5 slots=2 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_11_2 
b/experiments/distributed/classification/hostfiles/t716_hostfile_11_2 new file mode 100644 index 0000000..e06c571 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_11_2 @@ -0,0 +1,11 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 +gpu6 slots=1 +gpu7 slots=1 +gpu8 slots=1 +gpu9 slots=1 +gpu10 slots=1 +gpu11 slots=1 +gpu13 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_20 b/experiments/distributed/classification/hostfiles/t716_hostfile_20 new file mode 100644 index 0000000..096ab7e --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_20 @@ -0,0 +1,10 @@ +gpu1 slots=2 +gpu2 slots=2 +gpu3 slots=2 +gpu4 slots=2 +gpu5 slots=2 +gpu6 slots=2 +gpu7 slots=2 +gpu8 slots=2 +gpu9 slots=2 +gpu10 slots=2 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_21_2 b/experiments/distributed/classification/hostfiles/t716_hostfile_21_2 new file mode 100644 index 0000000..d12b1c3 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_21_2 @@ -0,0 +1,21 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 +gpu6 slots=1 +gpu7 slots=1 +gpu8 slots=1 +gpu9 slots=1 +gpu10 slots=1 +gpu11 slots=1 +gpu13 slots=1 +gpu14 slots=1 +gpu15 slots=1 +gpu16 slots=1 +gpu17 slots=1 +gpu19 slots=1 +gpu20 slots=1 +gpu21 slots=1 +gpu22 slots=1 +gpu23 slots=1 +gpu24 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_4 b/experiments/distributed/classification/hostfiles/t716_hostfile_4 new file mode 100644 index 0000000..4b82e19 --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_4 @@ -0,0 +1,4 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/hostfiles/t716_hostfile_5 
b/experiments/distributed/classification/hostfiles/t716_hostfile_5 new file mode 100644 index 0000000..c48ee2c --- /dev/null +++ b/experiments/distributed/classification/hostfiles/t716_hostfile_5 @@ -0,0 +1,5 @@ +gpu1 slots=1 +gpu3 slots=1 +gpu4 slots=1 +gpu5 slots=1 +gpu6 slots=1 \ No newline at end of file diff --git a/experiments/distributed/classification/main.py b/experiments/distributed/classification/main.py new file mode 100644 index 0000000..a421a6e --- /dev/null +++ b/experiments/distributed/classification/main.py @@ -0,0 +1,539 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback +import yaml + +import numpy as np +import psutil +import setproctitle +import torch +import wandb +from mpi4py import MPI + +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +from FedML.fedml_api.distributed.fedavg.FedAvgAPI import FedML_init, FedML_FedAvg_distributed + + +from data_preprocessing.ImageNet.data_loader import load_partition_data_ImageNet +from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + +from training.fedavg_classification_trainer import ClassificationTrainer + +from utils.context import ( + raise_MPI_error +) +from utils.logger import ( + logging_config +) + + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network 
used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + # parser.add_argument('--batch_size', type=int, default=64, metavar='N', + # help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) + + parser.add_argument('--epochs', type=int, default=5, metavar='EP', + help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--gpu_util_file', 
type=str, default=None, + help='the gpu utilization file for servers and clients. If there is no \ + gpu_util_file, gpu will not be used.') + parser.add_argument('--gpu_util_key', type=str, default=None, + help='the key in gpu utilization file') + parser.add_argument('--gpu_util_parse', type=str, default=None, + help='the gpu utilization string for servers and clients. If there is no \ + gpu_util_parse, gpu will not be used. Note if this and gpu_util_file are \ + both defined, gpu_util_parse will be used but not gpu_util_file') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # logging settings + parser.add_argument('--level', type=str, default='INFO', + help='level of logging') + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + parser.add_argument('-b', '--batch-size', 
type=int, default=32, metavar='N', + help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.0001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default=None, type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, 
default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0 (1e-5)') + # parser.add_argument('--epochs', type=int, default=200, metavar='N', + # help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + parser.add_argument('--decay-rounds', type=float, default=30, metavar='N', + help='round interval to decay LR') + + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + 
parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. "v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. 
(default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + 
parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_ImageNet(dataset=dataset_name, data_dir=args.data_dir, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'mini_gld_train_split.csv') + fed_test_map_file = os.path.join(args.data_dir, 'mini_gld_test.csv') + + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + elif dataset_name == "gld160k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + 
raise Exception("no such dataset") + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + + +def init_training_device(process_ID, fl_worker_num, gpu_num_per_machine): + # initialize the mapping from process ID to GPU ID: + if process_ID == 0: + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + return device + process_gpu_dict = dict() + for client_index in range(fl_worker_num): + gpu_index = client_index % gpu_num_per_machine + process_gpu_dict[client_index] = gpu_index + + logging.info(process_gpu_dict) + device = torch.device("cuda:" + str(process_gpu_dict[process_ID - 1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + return device + +def 
init_training_device_from_gpu_util_file(process_id, worker_number, gpu_util_file, gpu_util_key): + + if gpu_util_file == None: + device = torch.device("cpu") + logging.info(" !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logging.info(" ################## Not Indicate gpu_util_file, using cpu #################") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + else: + with open(gpu_util_file, 'r') as f: + gpu_util_yaml = yaml.load(f, Loader=yaml.FullLoader) + # gpu_util_num_process = 'gpu_util_' + str(worker_number) + # gpu_util = gpu_util_yaml[gpu_util_num_process] + gpu_util = gpu_util_yaml[gpu_util_key] + gpu_util_map = {} + i = 0 + for host, gpus_util_map_host in gpu_util.items(): + for gpu_j, num_process_on_gpu in enumerate(gpus_util_map_host): + for _ in range(num_process_on_gpu): + gpu_util_map[i] = (host, gpu_j) + i += 1 + logging.info("Process %d running on host: %s,gethostname: %s, gpu: %d ..." % ( + process_id, gpu_util_map[process_id][0], socket.gethostname(), gpu_util_map[process_id][1])) + assert i == worker_number + + device = torch.device("cuda:" + str(gpu_util_map[process_id][1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + +def init_training_device_from_gpu_util_parse(process_id, worker_number, gpu_util_parse): + if gpu_util_parse == None: + device = torch.device("cpu") + logging.info(" !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + logging.info(" ################## Not Indicate gpu_util_file, using cpu #################") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + else: + # example parse str `gpu_util_parse`: + # "gpu1:0,1,1,2;gpu2:3,3,3;gpu3:0,0,0,1,2,4,4,0" + gpu_util_parse_temp = gpu_util_parse.split(';') + gpu_util_parse_temp = [(item.split(':')[0], item.split(':')[1]) for item in gpu_util_parse_temp ] + + gpu_util = {} + for (host, gpus_str) in gpu_util_parse_temp: + 
gpu_util[host] = [int(num_process_on_gpu) for num_process_on_gpu in gpus_str.split(',')] + + gpu_util_map = {} + i = 0 + for host, gpus_util_map_host in gpu_util.items(): + for gpu_j, num_process_on_gpu in enumerate(gpus_util_map_host): + for _ in range(num_process_on_gpu): + gpu_util_map[i] = (host, gpu_j) + i += 1 + logging.info("Process %d running on host: %s,gethostname: %s, gpu: %d ..." % ( + process_id, gpu_util_map[process_id][0], socket.gethostname(), gpu_util_map[process_id][1])) + assert i == worker_number + + device = torch.device("cuda:" + str(gpu_util_map[process_id][1]) if torch.cuda.is_available() else "cpu") + logging.info(device) + #return gpu_util_map[process_id][1] + return device + + + +if __name__ == "__main__": + # initialize distributed computing (MPI) + comm, process_id, worker_number = FedML_init() + + with raise_MPI_error(): + # parse python script input parameters + parser = argparse.ArgumentParser() + args = add_args(parser) + args.rank = process_id + args.wd = args.weight_decay + + logging.info(args) + + # customize the process name + str_process_name = 'fedavg' + " :" + str(process_id) + setproctitle.setproctitle(str_process_name) + + logging_config(args, process_id) + + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). + name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + if process_id == 0: + wandb.init( + entity="automl", + project="fedcv-classification", + name="fedavg (d)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + # Set the random seed. The np.random seed determines the dataset partition. + # The torch_manual_seed determines the initial weight. 
+ # We fix these two, so that we can reproduce the result. + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # GPU arrangement: Please customize this function according your own topology. + # The GPU server list is configured at "mpi_host_file". + # If we have 4 machines and each has two GPUs, and your FL network has 8 workers and a central worker. + # The 4 machines will be assigned as follows: + # machine 1: worker0, worker4, worker8; + # machine 2: worker1, worker5; + # machine 3: worker2, worker6; + # machine 4: worker3, worker7; + # Therefore, we can see that workers are assigned according to the order of machine list. + logging.info("process_id = %d, size = %d" % (process_id, worker_number)) + if args.gpu_util_parse is not None: + device = init_training_device_from_gpu_util_parse(process_id, worker_number, args.gpu_util_parse) + else: + device = init_training_device_from_gpu_util_file(process_id, worker_number, args.gpu_util_file, args.gpu_util_key) + + # load data + dataset = load_data(args, args.dataset) + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. 
+ # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + + model_trainer = ClassificationTrainer(model, device, args) + FedML_FedAvg_distributed(process_id, worker_number, device, comm, + model, train_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, args, model_trainer) + + + + + + diff --git a/experiments/distributed/classification/main_fedavg.py b/experiments/distributed/classification/main_fedavg.py deleted file mode 100644 index 939db22..0000000 --- a/experiments/distributed/classification/main_fedavg.py +++ /dev/null @@ -1,236 +0,0 @@ -import argparse -import logging -import os -import random -import socket -import sys -import traceback - -import numpy as np -import psutil -import setproctitle -import torch -import wandb -from mpi4py import MPI - -sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) - -from FedML.fedml_api.distributed.fedavg.FedAvgAPI import FedML_init, FedML_FedAvg_distributed - -from data_preprocessing.ImageNet.data_loader import load_partition_data_ImageNet -from data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks -from model.classification.efficientnet import EfficientNet -from model.classification.mobilenet_v3 import MobileNetV3 -from training.classification_trainer import ClassificationTrainer - - -def add_args(parser): - """ - parser : argparse.ArgumentParser - return a parser added with args required by fit - """ - # Training settings - parser.add_argument('--model', type=str, default='mobilenet', metavar='N', - help='neural network used in training') - - parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', - help='dataset used for training') - - parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', - help='data directory') - - 
parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', - help='how to partition the dataset on local workers') - - parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', - help='partition alpha (default: 0.5)') - - parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', - help='number of workers in a distributed cluster') - - parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', - help='number of workers') - - parser.add_argument('--batch_size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - - parser.add_argument('--client_optimizer', type=str, default='adam', - help='SGD with momentum; adam') - - parser.add_argument('--lr', type=float, default=0.001, metavar='LR', - help='learning rate (default: 0.001)') - - parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.001) - - parser.add_argument('--epochs', type=int, default=5, metavar='EP', - help='how many epochs will be trained locally') - - parser.add_argument('--comm_round', type=int, default=10, - help='how many round of communications we shoud use') - - parser.add_argument('--is_mobile', type=int, default=0, - help='whether the program is running on the FedML-Mobile server side') - - parser.add_argument('--frequency_of_the_test', type=int, default=1, - help='the frequency of the algorithms') - - parser.add_argument('--gpu_server_num', type=int, default=1, - help='gpu_server_num') - - parser.add_argument('--gpu_num_per_server', type=int, default=4, - help='gpu_num_per_server') - - parser.add_argument('--ci', type=int, default=0, - help='CI') - args = parser.parse_args() - return args - - -def load_data(args, dataset_name): - if dataset_name == "ILSVRC2012": - logging.info("load_data. 
dataset_name = %s" % dataset_name) - train_data_num, test_data_num, train_data_global, test_data_global, \ - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ - class_num = load_partition_data_ImageNet(dataset=dataset_name, data_dir=args.data_dir, - partition_method=None, partition_alpha=None, - client_number=args.client_num_in_total, batch_size=args.batch_size) - - elif dataset_name == "gld23k": - logging.info("load_data. dataset_name = %s" % dataset_name) - args.client_num_in_total = 233 - fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') - fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') - args.data_dir = os.path.join(args.data_dir, 'images') - - train_data_num, test_data_num, train_data_global, test_data_global, \ - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ - class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, - fed_train_map_file=fed_train_map_file, - fed_test_map_file=fed_test_map_file, - partition_method=None, partition_alpha=None, - client_number=args.client_num_in_total, batch_size=args.batch_size) - - elif dataset_name == "gld160k": - logging.info("load_data. 
dataset_name = %s" % dataset_name) - args.client_num_in_total = 1262 - fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') - fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') - args.data_dir = os.path.join(args.data_dir, 'images') - - train_data_num, test_data_num, train_data_global, test_data_global, \ - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ - class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, - fed_train_map_file=fed_train_map_file, - fed_test_map_file=fed_test_map_file, - partition_method=None, partition_alpha=None, - client_number=args.client_num_in_total, batch_size=args.batch_size) - else: - raise Exception("no such dataset") - - dataset = [train_data_num, test_data_num, train_data_global, test_data_global, - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] - return dataset - - -def create_model(args, model_name, output_dim): - logging.info("create_model. 
model_name = %s, output_dim = %s" % (model_name, output_dim)) - if model_name == 'mobilenet_v3': - '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' - model = MobileNetV3(model_mode='LARGE') - elif model_name == 'efficientnet': - model = EfficientNet() - else: - raise Exception("no such model") - return model - - -def init_training_device(process_ID, fl_worker_num, gpu_num_per_machine): - # initialize the mapping from process ID to GPU ID: - if process_ID == 0: - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - return device - process_gpu_dict = dict() - for client_index in range(fl_worker_num): - gpu_index = client_index % gpu_num_per_machine - process_gpu_dict[client_index] = gpu_index - - logging.info(process_gpu_dict) - device = torch.device("cuda:" + str(process_gpu_dict[process_ID - 1]) if torch.cuda.is_available() else "cpu") - logging.info(device) - return device - - -if __name__ == "__main__": - # initialize distributed computing (MPI) - comm, process_id, worker_number = FedML_init() - - # parse python script input parameters - parser = argparse.ArgumentParser() - args = add_args(parser) - logging.info(args) - - # customize the process name - str_process_name = "FedAvg (distributed):" + str(process_id) - setproctitle.setproctitle(str_process_name) - - # customize the log format - # logging.basicConfig(level=logging.INFO, - logging.basicConfig(level=logging.DEBUG, - format=str( - process_id) + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', - datefmt='%a, %d %b %Y %H:%M:%S') - hostname = socket.gethostname() - logging.info("#############process ID = " + str(process_id) + - ", host name = " + hostname + "########" + - ", process ID = " + str(os.getpid()) + - ", process Name = " + str(psutil.Process(os.getpid()))) - - # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). 
- if process_id == 0: - wandb.init( - # project="federated_nas", - project="fedml", - name="FedAVG(d)" + str(args.partition_method) + "r" + str(args.comm_round) + "-e" + str( - args.epochs) + "-lr" + str( - args.lr), - config=args - ) - - # Set the random seed. The np.random seed determines the dataset partition. - # The torch_manual_seed determines the initial weight. - # We fix these two, so that we can reproduce the result. - random.seed(0) - np.random.seed(0) - torch.manual_seed(0) - torch.cuda.manual_seed_all(0) - - # GPU arrangement: Please customize this function according your own topology. - # The GPU server list is configured at "mpi_host_file". - # If we have 4 machines and each has two GPUs, and your FL network has 8 workers and a central worker. - # The 4 machines will be assigned as follows: - # machine 1: worker0, worker4, worker8; - # machine 2: worker1, worker5; - # machine 3: worker2, worker6; - # machine 4: worker3, worker7; - # Therefore, we can see that workers are assigned according to the order of machine list. - logging.info("process_id = %d, size = %d" % (process_id, worker_number)) - device = init_training_device(process_id, worker_number - 1, args.gpu_num_per_server) - - # load data - dataset = load_data(args, args.dataset) - [train_data_num, test_data_num, train_data_global, test_data_global, - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset - - # create model. - # Note if the model is DNN (e.g., ResNet), the training will be very slow. 
- # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) - model = create_model(args, model_name=args.model, output_dim=dataset[7]) - - # define my own trainer - model_trainer = ClassificationTrainer(model) - - # start "federated averaging (FedAvg)" - FedML_FedAvg_distributed(process_id, worker_number, device, comm, - model, train_data_num, train_data_global, test_data_global, - train_data_local_num_dict, train_data_local_dict, test_data_local_dict, - args, model_trainer) \ No newline at end of file diff --git a/experiments/distributed/classification/mpi_host_file b/experiments/distributed/classification/mpi_host_file index cf5c6bd..9f3c558 100644 --- a/experiments/distributed/classification/mpi_host_file +++ b/experiments/distributed/classification/mpi_host_file @@ -1 +1 @@ -ChaoyangHe-GPU-RTX2080Tix4 +ChaoyangHe-GPU-RTX2080Tix4 \ No newline at end of file diff --git a/experiments/distributed/classification/parse.py b/experiments/distributed/classification/parse.py new file mode 100644 index 0000000..556608c --- /dev/null +++ b/experiments/distributed/classification/parse.py @@ -0,0 +1,25 @@ +def parse(process_id, worker_number, gpu_util_parse): + gpu_util_parse_temp = gpu_util_parse.split(';') + gpu_util_parse_temp = [(item.split(':')[0], item.split(':')[1]) for item in gpu_util_parse_temp ] + + gpu_util = {} + for (host, gpus_str) in gpu_util_parse_temp: + gpu_util[host] = [int(num_process_on_gpu) for num_process_on_gpu in gpus_str.split(',')] + + gpu_util_map = {} + i = 0 + for host, gpus_util_map_host in gpu_util.items(): + for gpu_j, num_process_on_gpu in enumerate(gpus_util_map_host): + for _ in range(num_process_on_gpu): + gpu_util_map[i] = (host, gpu_j) + i += 1 + print(gpu_util_map) + + +parse(0, 10, "local:2,2,0,6") + + + + + + diff --git a/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh b/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh index 8e98eef..b6f1578 
100644 --- a/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh +++ b/experiments/distributed/classification/run_fedavg_distributed_pytorch.sh @@ -1,37 +1,19 @@ #!/usr/bin/env bash -CLIENT_NUM=$1 -WORKER_NUM=$2 -SERVER_NUM=$3 -GPU_NUM_PER_SERVER=$4 -MODEL=$5 -DISTRIBUTION=$6 -ROUND=$7 -EPOCH=$8 -BATCH_SIZE=$9 -LR=${10} -DATASET=${11} -DATA_DIR=${12} -CLIENT_OPTIMIZER=${13} -CI=${14} +WORKER_NUM=$1 +MPI_HOST_FILE=$2 +PYTHON=$3 +ARGS=$4 + PROCESS_NUM=`expr $WORKER_NUM + 1` echo $PROCESS_NUM +echo $MPI_HOST_FILE + -hostname > mpi_host_file -mpirun -np $PROCESS_NUM -hostfile ./mpi_host_file python3 ./main_fedavg.py \ - --gpu_server_num $SERVER_NUM \ - --gpu_num_per_server $GPU_NUM_PER_SERVER \ - --model $MODEL \ - --dataset $DATASET \ - --data_dir $DATA_DIR \ - --partition_method $DISTRIBUTION \ - --client_num_in_total $CLIENT_NUM \ +mpirun -np $PROCESS_NUM -hostfile ./$MPI_HOST_FILE $PYTHON ./main.py \ --client_num_per_round $WORKER_NUM \ - --comm_round $ROUND \ - --epochs $EPOCH \ - --client_optimizer $CLIENT_OPTIMIZER \ - --batch_size $BATCH_SIZE \ - --lr $LR \ - --ci $CI + $ARGS + + diff --git a/experiments/distributed/classification/run_with_conf.sh b/experiments/distributed/classification/run_with_conf.sh new file mode 100755 index 0000000..25dd94d --- /dev/null +++ b/experiments/distributed/classification/run_with_conf.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +WORKER_NUM=$1 +MPI_HOST_FILE=$2 +DATASET=$3 +DATA_DIR=$4 +ARGS=$5 + + +source configs/cluster.conf +PYTHON=`cat configs/cluster.conf | grep PYTHON | awk -F "=" '{print $2}'` +data_dir=`cat configs/cluster.conf | grep $DATA_DIR | awk -F "=" '{print $2}'` + + +PROCESS_NUM=`expr $WORKER_NUM + 1` +echo $PROCESS_NUM +echo $MPI_HOST_FILE +echo $PYTHON +echo $data_dir + + + + +mpirun -np $PROCESS_NUM -hostfile ./$MPI_HOST_FILE \ + $PYTHON ./main.py \ + --data_dir $data_dir --dataset $DATASET \ + --client_num_per_round $WORKER_NUM \ + $ARGS + + diff --git 
a/experiments/distributed/classification/run_with_conf_t716.sh b/experiments/distributed/classification/run_with_conf_t716.sh new file mode 100644 index 0000000..ef67922 --- /dev/null +++ b/experiments/distributed/classification/run_with_conf_t716.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +WORKER_NUM=$1 +MPI_HOST_FILE=$2 +DATASET=$3 +DATA_DIR=$4 +ARGS=$5 + + +source configs/cluster.conf +PYTHON=`cat configs/cluster.conf | grep PYTHON | awk -F "=" '{print $2}'` +data_dir=`cat configs/cluster.conf | grep $DATA_DIR | awk -F "=" '{print $2}'` + + +PROCESS_NUM=`expr $WORKER_NUM + 1` +echo $PROCESS_NUM +echo $MPI_HOST_FILE +echo $PYTHON +echo $data_dir + + + + +mpirun -np $PROCESS_NUM -hostfile ./$MPI_HOST_FILE \ + -mca btl_tcp_if_include 192.168.0.101/24 \ + $PYTHON ./main.py \ + --data_dir $data_dir --dataset $DATASET \ + --client_num_per_round $WORKER_NUM \ + $ARGS + + diff --git a/experiments/distributed/classification/sbatch_run.sh b/experiments/distributed/classification/sbatch_run.sh new file mode 100644 index 0000000..37f1e02 --- /dev/null +++ b/experiments/distributed/classification/sbatch_run.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +#SBATCH -o /apps/mpi/myjob.out +#SBATCH --nodes=2 +#SBATCH --ntasks-per-node=2 +mpirun python /apps/mpi/helloworld.py + + + diff --git a/experiments/distributed/classification/slurm-5793.out b/experiments/distributed/classification/slurm-5793.out new file mode 100644 index 0000000..abf1d78 --- /dev/null +++ b/experiments/distributed/classification/slurm-5793.out @@ -0,0 +1,4 @@ +Traceback (most recent call last): + File "/var/spool/slurmd/job05793/slurm_script", line 6, in + from mpi4py import MPI +ModuleNotFoundError: No module named 'mpi4py' diff --git a/experiments/standalone/__init__.py b/experiments/standalone/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/standalone/classification/.main.py.swp b/experiments/standalone/classification/.main.py.swp new file mode 100644 index 0000000..ff42f13 Binary 
files /dev/null and b/experiments/standalone/classification/.main.py.swp differ diff --git a/experiments/standalone/classification/README.md b/experiments/standalone/classification/README.md new file mode 100644 index 0000000..e69de29 diff --git a/experiments/standalone/classification/__init__.py b/experiments/standalone/classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/standalone/classification/client.py b/experiments/standalone/classification/client.py new file mode 100644 index 0000000..6e4f5db --- /dev/null +++ b/experiments/standalone/classification/client.py @@ -0,0 +1,40 @@ +import logging + + +class Client: + + def __init__(self, client_idx, local_training_data, local_test_data, local_sample_number, args, device, + model_trainer): + self.client_idx = client_idx + self.local_training_data = local_training_data + self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + logging.info("self.local_sample_number = " + str(self.local_sample_number)) + + self.args = args + self.device = device + self.model_trainer = model_trainer + + def update_local_dataset(self, client_idx, local_training_data, local_test_data, local_sample_number): + self.client_idx = client_idx + self.local_training_data = local_training_data + self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + + def get_sample_number(self): + return self.local_sample_number + + def train(self, w_global, round_idx): + self.args.round_idx = round_idx + self.model_trainer.set_model_params(w_global) + self.model_trainer.train(self.local_training_data, self.device, self.args) + weights = self.model_trainer.get_model_params() + return weights + + def local_test(self, b_use_test_dataset): + if b_use_test_dataset: + test_data = self.local_test_data + else: + test_data = self.local_training_data + metrics = self.model_trainer.test(test_data, self.device, self.args) + return metrics diff --git 
a/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md new file mode 100644 index 0000000..2c24943 --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 EfficientNet.md @@ -0,0 +1,61 @@ +# ILSVRC2012-100 + + +``` +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ 
+--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +``` + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md new file mode 100644 index 0000000..554ef6c --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/ILSVRC2012-100 MobileNetV3.md @@ -0,0 +1,202 @@ +# ILSVRC2012-100 + + + +# 10 clients +``` + +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset 
ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 256 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ 
+--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +# scigpu +~/anaconda3/envs/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/imagenet/ILSVRC2012_dataset \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 256 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ 
+--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +# ================================================================================================== +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.001 
--opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 20 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset ILSVRC2012-100 --data_dir /home/datasets/ILSVRC2012_dataset \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 +``` + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/cifar100 EfficientNet.md b/experiments/standalone/classification/experiment_scripts/cifar100 EfficientNet.md new file mode 100644 index 0000000..eaec2ca --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/cifar100 EfficientNet.md @@ -0,0 +1,30 @@ +# gld23k + +# 10 clients +``` +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 100 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + + +``` + + + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/cifar100 MobileNetV3.md b/experiments/standalone/classification/experiment_scripts/cifar100 MobileNetV3.md new file mode 100644 index 0000000..19df2ff --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/cifar100 MobileNetV3.md @@ -0,0 +1,32 @@ +# gld23k + +# 10 clients +``` +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 
100 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 64 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + + + + +``` + + + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/gld23k EfficientNet.md b/experiments/standalone/classification/experiment_scripts/gld23k EfficientNet.md new file mode 100644 index 0000000..0c011e7 --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/gld23k EfficientNet.md @@ -0,0 +1,255 @@ +# gld23k + + +# 4 clients +``` + +# DAAI + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 
--data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.003 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 +``` + + +# 10 clients + +``` +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 
--data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + + +# ========================================================================= +# high decay-rate + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k 
--data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.1 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ 
+--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.005 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +``` + + + + + + + + + + + + + + diff --git a/experiments/standalone/classification/experiment_scripts/gld23k MobileNetV3.md b/experiments/standalone/classification/experiment_scripts/gld23k MobileNetV3.md new file mode 
100644 index 0000000..ba67014 --- /dev/null +++ b/experiments/standalone/classification/experiment_scripts/gld23k MobileNetV3.md @@ -0,0 +1,173 @@ +# gld23k + + + +# 10 clients +``` + +# DAAI + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 4 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 1000 --epochs 1 \ +--model efficientnet \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .992 + +# ========================================================================= +# high decay-rate + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ 
+--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .997 + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 2000 --epochs 1 \ +--model 
mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.1 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .999 + + +~/py36/bin/python ./main.py \ +--gpu 0 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 32 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 1 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 128 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.03 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 2 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.01 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + +~/py36/bin/python ./main.py \ +--gpu 3 \ +--client_num_per_round 10 --client_num_in_total 233 \ +--frequency_of_the_test 10 \ +--dataset gld23k --data_dir /home/datasets/landmarks \ +--if-timm-dataset -b 16 --data_transform FLTransform \ +--comm_round 3000 --epochs 1 \ +--model mobilenet_v3 \ +--drop 0.2 --drop-connect 
0.2 --remode pixel --reprob 0.2 \ +--opt rmsproptf --lr 0.005 --opt-eps .001 --warmup-lr 1e-6 --weight-decay 1e-5 \ +--sched step --decay-rounds 1 --decay-rate .9992 + + +``` + + + + + + + + + + + + + + diff --git a/experiments/standalone/classification/fedavg_api.py b/experiments/standalone/classification/fedavg_api.py new file mode 100644 index 0000000..c4d3154 --- /dev/null +++ b/experiments/standalone/classification/fedavg_api.py @@ -0,0 +1,216 @@ +import copy +import logging +import random + +import numpy as np +import torch +import wandb + +from client import Client + + +class FedAvgAPI(object): + def __init__(self, dataset, device, args, model_trainer): + self.device = device + self.args = args + [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] = dataset + self.train_global = train_data_global + self.test_global = test_data_global + self.val_global = None + self.train_data_num_in_total = train_data_num + self.test_data_num_in_total = test_data_num + + self.client_list = [] + self.train_data_local_num_dict = train_data_local_num_dict + self.train_data_local_dict = train_data_local_dict + self.test_data_local_dict = test_data_local_dict + + self.model_trainer = model_trainer + self._setup_clients(train_data_local_num_dict, train_data_local_dict, test_data_local_dict, model_trainer) + + def _setup_clients(self, train_data_local_num_dict, train_data_local_dict, test_data_local_dict, model_trainer): + logging.info("############setup_clients (START)#############") + for client_idx in range(self.args.client_num_per_round): + c = Client(client_idx, train_data_local_dict[client_idx], test_data_local_dict[client_idx], + train_data_local_num_dict[client_idx], self.args, self.device, model_trainer) + self.client_list.append(c) + logging.info("############setup_clients (END)#############") + + def train(self): + w_global = self.model_trainer.get_model_params() + for 
round_idx in range(self.args.comm_round): + + logging.info("################Communication round : {}".format(round_idx)) + + w_locals = [] + + """ + for scalability: following the original FedAvg algorithm, we uniformly sample a fraction of clients in each round. + Instead of changing the 'Client' instances, our implementation keeps the 'Client' instances and then updates their local dataset + """ + client_indexes = self._client_sampling(round_idx, self.args.client_num_in_total, + self.args.client_num_per_round) + logging.info("client_indexes = " + str(client_indexes)) + + for idx, client in enumerate(self.client_list): + # update dataset + client_idx = client_indexes[idx] + client.update_local_dataset(client_idx, self.train_data_local_dict[client_idx], + self.test_data_local_dict[client_idx], + self.train_data_local_num_dict[client_idx]) + + # train on new dataset + w = client.train(w_global, round_idx) + # self.logger.info("local weights = " + str(w)) + w_locals.append((client.get_sample_number(), copy.deepcopy(w))) + + # update global weights + w_global = self._aggregate(w_locals) + self.model_trainer.set_model_params(w_global) + + # test results + # at last round + if round_idx == self.args.comm_round - 1: + self._local_test_on_all_clients(round_idx) + # per {frequency_of_the_test} round + elif round_idx % self.args.frequency_of_the_test == 0: + if self.args.dataset.startswith("stackoverflow"): + self._local_test_on_validation_set(round_idx) + else: + self._local_test_on_all_clients(round_idx) + + def _client_sampling(self, round_idx, client_num_in_total, client_num_per_round): + if client_num_in_total == client_num_per_round: + client_indexes = [client_index for client_index in range(client_num_in_total)] + else: + num_clients = min(client_num_per_round, client_num_in_total) + np.random.seed(round_idx) # make sure for each comparison, we are selecting the same clients each round + client_indexes = np.random.choice(range(client_num_in_total), num_clients, 
replace=False) + logging.info("client_indexes = %s" % str(client_indexes)) + return client_indexes + + def _generate_validation_set(self, num_samples=10000): + test_data_num = len(self.test_global.dataset) + sample_indices = random.sample(range(test_data_num), min(num_samples, test_data_num)) + subset = torch.utils.data.Subset(self.test_global.dataset, sample_indices) + sample_testset = torch.utils.data.DataLoader(subset, batch_size=self.args.batch_size) + self.val_global = sample_testset + + def _aggregate(self, w_locals): + training_num = 0 + for idx in range(len(w_locals)): + (sample_num, averaged_params) = w_locals[idx] + training_num += sample_num + + (sample_num, averaged_params) = w_locals[0] + for k in averaged_params.keys(): + for i in range(0, len(w_locals)): + local_sample_number, local_model_params = w_locals[i] + w = local_sample_number / training_num + if i == 0: + averaged_params[k] = local_model_params[k] * w + else: + averaged_params[k] += local_model_params[k] * w + return averaged_params + + def _local_test_on_all_clients(self, round_idx): + + logging.info("################local_test_on_all_clients : {}".format(round_idx)) + + # train_metrics = { + # 'num_samples': [], + # 'num_correct': [], + # 'losses': [] + # } + + test_metrics = { + 'num_samples': [], + 'num_correct': [], + 'losses': [] + } + + client = self.client_list[0] + + for client_idx in range(self.args.client_num_in_total): + """ + Note: for datasets like "fed_CIFAR100" and "fed_shakespheare", + the training client number is larger than the testing client number + """ + if self.args.dataset in ['gld23k', 'gld160k'] and client_idx > 1: + break + if self.test_data_local_dict[client_idx] is None: + continue + client.update_local_dataset(0, self.train_data_local_dict[client_idx], + self.test_data_local_dict[client_idx], + self.train_data_local_num_dict[client_idx]) + # # train data + # train_local_metrics = client.local_test(False) + # 
train_metrics['num_samples'].append(copy.deepcopy(train_local_metrics['test_total'])) + # train_metrics['num_correct'].append(copy.deepcopy(train_local_metrics['test_correct'])) + # train_metrics['losses'].append(copy.deepcopy(train_local_metrics['test_loss'])) + + # test data + test_local_metrics = client.local_test(True) + test_metrics['num_samples'].append(copy.deepcopy(test_local_metrics['test_total'])) + test_metrics['num_correct'].append(copy.deepcopy(test_local_metrics['test_correct'])) + test_metrics['losses'].append(copy.deepcopy(test_local_metrics['test_loss'])) + + """ + Note: CI environment is CPU-based computing. + The training speed for RNN training is to slow in this setting, so we only test a client to make sure there is no programming error. + """ + if self.args.ci == 1: + break + + # test on training dataset + # train_acc = sum(train_metrics['num_correct']) / sum(train_metrics['num_samples']) + # train_loss = sum(train_metrics['losses']) / sum(train_metrics['num_samples']) + + # test on test dataset + test_acc = sum(test_metrics['num_correct']) / sum(test_metrics['num_samples']) + test_loss = sum(test_metrics['losses']) / sum(test_metrics['num_samples']) + + # stats = {'training_acc': train_acc, 'training_loss': train_loss} + # wandb.log({"Train/Acc": train_acc, "round": round_idx}) + # wandb.log({"Train/Loss": train_loss, "round": round_idx}) + # logging.info(stats) + + stats = {'test_acc': test_acc, 'test_loss': test_loss} + wandb.log({"Test/Acc": test_acc, "round": round_idx}) + wandb.log({"Test/Loss": test_loss, "round": round_idx}) + logging.info(stats) + + + def _local_test_on_validation_set(self, round_idx): + + logging.info("################local_test_on_validation_set : {}".format(round_idx)) + + if self.val_global is None: + self._generate_validation_set() + + client = self.client_list[0] + client.update_local_dataset(0, None, self.val_global, None) + # test data + test_metrics = client.local_test(True) + + if self.args.dataset == 
"stackoverflow_nwp": + test_acc = test_metrics['test_correct'] / test_metrics['test_total'] + test_loss = test_metrics['test_loss'] / test_metrics['test_total'] + stats = {'test_acc': test_acc, 'test_loss': test_loss} + wandb.log({"Test/Acc": test_acc, "round": round_idx}) + wandb.log({"Test/Loss": test_loss, "round": round_idx}) + elif self.args.dataset == "stackoverflow_lr": + test_acc = test_metrics['test_correct'] / test_metrics['test_total'] + test_pre = test_metrics['test_precision'] / test_metrics['test_total'] + test_rec = test_metrics['test_recall'] / test_metrics['test_total'] + test_loss = test_metrics['test_loss'] / test_metrics['test_total'] + stats = {'test_acc': test_acc, 'test_pre': test_pre, 'test_rec': test_rec, 'test_loss': test_loss} + wandb.log({"Test/Acc": test_acc, "round": round_idx}) + wandb.log({"Test/Pre": test_pre, "round": round_idx}) + wandb.log({"Test/Rec": test_rec, "round": round_idx}) + wandb.log({"Test/Loss": test_loss, "round": round_idx}) + else: + raise Exception("Unknown format to log metrics for dataset {}!"%self.args.dataset) + + logging.info(stats) diff --git a/experiments/standalone/classification/main.py b/experiments/standalone/classification/main.py new file mode 100644 index 0000000..65023bb --- /dev/null +++ b/experiments/standalone/classification/main.py @@ -0,0 +1,436 @@ +import argparse +import logging +import os +import random +import socket +import sys +import traceback +import yaml + +import numpy as np +import psutil +import setproctitle +import torch +import wandb + + +from timm import create_model as timm_create_model +from timm.models import resume_checkpoint, load_checkpoint, convert_splitbn_model + + +sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "../../../"))) + +# from FedML.fedml_api.standalone.fedavg.fedavg_api import FedAvgAPI +from fedavg_api import FedAvgAPI + +from data_preprocessing.ImageNet.data_loader import load_partition_data_ImageNet +from 
data_preprocessing.Landmarks.data_loader import load_partition_data_landmarks +from data_preprocessing.cifar10.iid_data_loader import load_iid_cifar10 +from data_preprocessing.cifar10.data_loader import load_partition_data_cifar10 +from data_preprocessing.cifar100.data_loader import load_partition_data_cifar100 +from data_preprocessing.cinic10.data_loader import load_partition_data_cinic10 + +from training.fedavg_classification_trainer import ClassificationTrainer + +from utils.logger import ( + logging_config +) + + + +def add_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + # Training settings + parser.add_argument('--model', type=str, default='mobilenet', metavar='N', + help='neural network used in training') + + parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', + help='dataset used for training') + + parser.add_argument('--data_dir', type=str, default='./../../../data/cifar10', + help='data directory') + + parser.add_argument('--partition_method', type=str, default='hetero', metavar='N', + help='how to partition the dataset on local workers') + + parser.add_argument('--partition_alpha', type=float, default=0.5, metavar='PA', + help='partition alpha (default: 0.5)') + + parser.add_argument('--client_num_in_total', type=int, default=1000, metavar='NN', + help='number of workers in a distributed cluster') + + parser.add_argument('--client_num_per_round', type=int, default=4, metavar='NN', + help='number of workers') + + # parser.add_argument('--batch_size', type=int, default=64, metavar='N', + # help='input batch size for training (default: 64)') + + parser.add_argument('--client_optimizer', type=str, default='adam', + help='SGD with momentum; adam') + + # parser.add_argument('--lr', type=float, default=0.001, metavar='LR', + # help='learning rate (default: 0.001)') + + parser.add_argument('--wd', help='weight decay parameter;', type=float, default=0.00001) + + 
parser.add_argument('--epochs', type=int, default=5, metavar='EP', + help='how many epochs will be trained locally') + + parser.add_argument('--comm_round', type=int, default=10, + help='how many round of communications we shoud use') + + parser.add_argument('--is_mobile', type=int, default=0, + help='whether the program is running on the FedML-Mobile server side') + + parser.add_argument('--frequency_of_the_test', type=int, default=1, + help='the frequency of the algorithms') + + parser.add_argument('--gpu_server_num', type=int, default=1, + help='gpu_server_num') + + parser.add_argument('--gpu_num_per_server', type=int, default=4, + help='gpu_num_per_server') + + parser.add_argument('--gpu', type=int, default=0, + help='gpu') + + parser.add_argument('--ci', type=int, default=0, + help='CI') + + parser.add_argument('--gpu_util_file', type=str, default=None, + help='the gpu utilization file for servers and clients. If there is no \ + gpu_util_file, gpu will not be used.') + parser.add_argument('--gpu_util_key', type=str, default=None, + help='the key in gpu utilization file') + parser.add_argument('--gpu_util_parse', type=str, default=None, + help='the gpu utilization string for servers and clients. If there is no \ + gpu_util_parse, gpu will not be used. 
Note if this and gpu_util_file are \ + both defined, gpu_util_parse will be used but not gpu_util_file') + + parser.add_argument('--pretrained',action='store_true', default=False, + help='Start with pretrained version of specified network (if avail)') + + parser.add_argument('--distributed', action='store_true', default=False, + help='If distributed training') + + parser.add_argument('--if-timm-dataset', action='store_true', default=False, + help='If use timm dataset augmentation') + + parser.add_argument('--data_load_num_workers', type=int, default=4, + help='number of workers when loading data') + + + # logging settings + parser.add_argument('--level', type=str, default='INFO', + help='level of logging') + + # Dataset + parser.add_argument('--img-size', type=int, default=None, metavar='N', + help='Image patch size (default: None => model default)') + parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop percent (for validation only)') + parser.add_argument('--data_transform', default=None, type=str, metavar='TRANSFORM', + help='How to do data transform') + parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') + parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of of dataset') + parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') + parser.add_argument('-b', '--batch-size', type=int, default=32, metavar='N', + help='input batch size for training (default: 32)') + parser.add_argument('-vb', '--validation-batch-size-multiplier', type=int, default=1, metavar='N', + help='ratio of validation batch size to training batch size (default: 1)') + + + # Model parameters + parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). 
Model default if None.') + + # Optimizer parameters + parser.add_argument('--opt', default='sgd', type=str, metavar='OPTIMIZER', + help='Optimizer (default: "sgd"') + parser.add_argument('--opt-eps', default=None, type=float, metavar='EPSILON', + help='Optimizer Epsilon (default: None, use opt default)') + parser.add_argument('--opt-betas', default=None, type=float, nargs='+', metavar='BETA', + help='Optimizer Betas (default: None, use opt default)') + parser.add_argument('--momentum', type=float, default=0.9, metavar='M', + help='Optimizer momentum (default: 0.9)') + parser.add_argument('--weight-decay', type=float, default=0.00001, + help='weight decay (default: 0.0001)') + parser.add_argument('--clip-grad', type=float, default=None, metavar='NORM', + help='Clip gradient norm (default: None, no clipping)') + + + # Learning rate schedule parameters + parser.add_argument('--sched', default=None, type=str, metavar='SCHEDULER', + help='LR scheduler (default: "step"') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--lr-noise', type=float, nargs='+', default=None, metavar='pct, pct', + help='learning rate noise on/off epoch percentages') + parser.add_argument('--lr-noise-pct', type=float, default=0.67, metavar='PERCENT', + help='learning rate noise limit percent (default: 0.67)') + parser.add_argument('--lr-noise-std', type=float, default=1.0, metavar='STDDEV', + help='learning rate noise std-dev (default: 1.0)') + parser.add_argument('--lr-cycle-mul', type=float, default=1.0, metavar='MULT', + help='learning rate cycle len multiplier (default: 1.0)') + parser.add_argument('--lr-cycle-limit', type=int, default=1, metavar='N', + help='learning rate cycle limit') + parser.add_argument('--warmup-lr', type=float, default=0.0001, metavar='LR', + help='warmup learning rate (default: 0.0001)') + parser.add_argument('--min-lr', type=float, default=1e-5, metavar='LR', + help='lower lr bound 
for cyclic schedulers that hit 0 (1e-5)') + # parser.add_argument('--epochs', type=int, default=200, metavar='N', + # help='number of epochs to train (default: 2)') + parser.add_argument('--start-epoch', default=None, type=int, metavar='N', + help='manual epoch number (useful on restarts)') + parser.add_argument('--decay-epochs', type=float, default=30, metavar='N', + help='epoch interval to decay LR') + parser.add_argument('--warmup-epochs', type=int, default=3, metavar='N', + help='epochs to warmup LR, if scheduler supports') + parser.add_argument('--cooldown-epochs', type=int, default=10, metavar='N', + help='epochs to cooldown LR at min_lr, after cyclic schedule ends') + parser.add_argument('--patience-epochs', type=int, default=10, metavar='N', + help='patience epochs for Plateau LR scheduler (default: 10') + parser.add_argument('--decay-rate', '--dr', type=float, default=0.1, metavar='RATE', + help='LR decay rate (default: 0.1)') + + parser.add_argument('--decay-rounds', type=float, default=30, metavar='N', + help='round interval to decay LR') + + + # Augmentation & regularization parameters + parser.add_argument('--no-aug', action='store_true', default=False, + help='Disable all training augmentation, override other train aug args') + parser.add_argument('--scale', type=float, nargs='+', default=[0.08, 1.0], metavar='PCT', + help='Random resize scale (default: 0.08 1.0)') + parser.add_argument('--ratio', type=float, nargs='+', default=[3./4., 4./3.], metavar='RATIO', + help='Random resize aspect ratio (default: 0.75 1.33)') + parser.add_argument('--hflip', type=float, default=0.5, + help='Horizontal flip training aug probability') + parser.add_argument('--vflip', type=float, default=0., + help='Vertical flip training aug probability') + parser.add_argument('--color-jitter', type=float, default=0.4, metavar='PCT', + help='Color jitter factor (default: 0.4)') + parser.add_argument('--aa', type=str, default=None, metavar='NAME', + help='Use AutoAugment policy. 
"v0" or "original". (default: None)'), + parser.add_argument('--aug-splits', type=int, default=0, + help='Number of augmentation splits (default: 0, valid: 0 or >=2)') + parser.add_argument('--jsd', action='store_true', default=False, + help='Enable Jensen-Shannon Divergence + CE loss. Use with `--aug-splits`.') + parser.add_argument('--reprob', type=float, default=0., metavar='PCT', + help='Random erase prob (default: 0.)') + parser.add_argument('--remode', type=str, default='const', + help='Random erase mode (default: "const")') + parser.add_argument('--recount', type=int, default=1, + help='Random erase count (default: 1)') + parser.add_argument('--resplit', action='store_true', default=False, + help='Do not random erase first (clean) augmentation split') + parser.add_argument('--mixup', type=float, default=0.0, + help='mixup alpha, mixup enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix', type=float, default=0.0, + help='cutmix alpha, cutmix enabled if > 0. (default: 0.)') + parser.add_argument('--cutmix-minmax', type=float, nargs='+', default=None, + help='cutmix min/max ratio, overrides alpha and enables cutmix if set (default: None)') + parser.add_argument('--mixup-prob', type=float, default=1.0, + help='Probability of performing mixup or cutmix when either/both is enabled') + parser.add_argument('--mixup-switch-prob', type=float, default=0.5, + help='Probability of switching to cutmix when both mixup and cutmix enabled') + parser.add_argument('--mixup-mode', type=str, default='batch', + help='How to apply mixup/cutmix params. 
Per "batch", "pair", or "elem"') + parser.add_argument('--mixup-off-epoch', default=0, type=int, metavar='N', + help='Turn off mixup after this epoch, disabled if 0 (default: 0)') + parser.add_argument('--smoothing', type=float, default=0.1, + help='Label smoothing (default: 0.1)') + parser.add_argument('--train-interpolation', type=str, default='random', + help='Training interpolation (random, bilinear, bicubic default: "random")') + parser.add_argument('--drop', type=float, default=0.0, metavar='PCT', + help='Dropout rate (default: 0.)') + parser.add_argument('--drop-connect', type=float, default=None, metavar='PCT', + help='Drop connect rate, DEPRECATED, use drop-path (default: None)') + parser.add_argument('--drop-path', type=float, default=None, metavar='PCT', + help='Drop path rate (default: None)') + parser.add_argument('--drop-block', type=float, default=None, metavar='PCT', + help='Drop block rate (default: None)') + + # Batch norm parameters (only works with gen_efficientnet based models currently) + parser.add_argument('--bn-tf', type=bool, default=False, + help='Use Tensorflow BatchNorm defaults for models that support it (default: False)') + parser.add_argument('--bn-momentum', type=float, default=None, + help='BatchNorm momentum override (if not None)') + parser.add_argument('--bn-eps', type=float, default=None, + help='BatchNorm epsilon override (if not None)') + parser.add_argument('--sync-bn', action='store_true', + help='Enable NVIDIA Apex or Torch synchronized BatchNorm.') + parser.add_argument('--dist-bn', type=str, default='', + help='Distribute BatchNorm stats between nodes after each epoch ("broadcast", "reduce", or "")') + parser.add_argument('--split-bn', action='store_true', + help='Enable separate BN layers per augmentation split.') + + # Model Exponential Moving Average + parser.add_argument('--model-ema', action='store_true', default=False, + help='Enable tracking moving average of model weights') + 
parser.add_argument('--model-ema-force-cpu', action='store_true', default=False, + help='Force ema to be tracked on CPU, rank=0 node only. Disables EMA validation.') + parser.add_argument('--model-ema-decay', type=float, default=0.9998, + help='decay factor for model weights moving average (default: 0.9998)') + + + args = parser.parse_args() + return args + + +def load_data(args, dataset_name): + if dataset_name in ["ILSVRC2012", "ILSVRC2012-100"]: + logging.info("load_data. dataset_name = %s" % dataset_name) + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_ImageNet(dataset=dataset_name, data_dir=args.data_dir, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + + elif dataset_name == "gld23k": + logging.info("load_data. dataset_name = %s" % dataset_name) + args.client_num_in_total = 233 + # fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_train.csv') + # fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld23k_user_dict_test.csv') + fed_train_map_file = os.path.join(args.data_dir, 'mini_gld_train_split.csv') + fed_test_map_file = os.path.join(args.data_dir, 'mini_gld_test.csv') + + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + elif dataset_name == "gld160k": + logging.info("load_data. 
dataset_name = %s" % dataset_name) + args.client_num_in_total = 1262 + fed_train_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_train.csv') + fed_test_map_file = os.path.join(args.data_dir, 'data_user_dict/gld160k_user_dict_test.csv') + args.data_dir = os.path.join(args.data_dir, 'images') + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = load_partition_data_landmarks(dataset=dataset_name, data_dir=args.data_dir, + fed_train_map_file=fed_train_map_file, + fed_test_map_file=fed_test_map_file, + partition_method=None, partition_alpha=None, + client_number=args.client_num_in_total, + batch_size=args.batch_size, args=args) + else: + if dataset_name == "cifar10": + data_loader = load_partition_data_cifar10 + elif dataset_name == "cifar100": + data_loader = load_partition_data_cifar100 + elif dataset_name == "cinic10": + data_loader = load_partition_data_cinic10 + else: + raise Exception("no such dataset") + + train_data_num, test_data_num, train_data_global, test_data_global, \ + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, \ + class_num = data_loader(args.dataset, args.data_dir, args.partition_method, + args.partition_alpha, args.client_num_in_total, args.batch_size) + + dataset = [train_data_num, test_data_num, train_data_global, test_data_global, + train_data_local_num_dict, train_data_local_dict, test_data_local_dict, class_num] + return dataset + + +def create_model(args, model_name, output_dim): + logging.info("create_model. 
model_name = %s, output_dim = %s" % (model_name, output_dim)) + if model_name == 'mobilenet_v3': + '''model_mode \in {LARGE: 5.15M, SMALL: 2.94M}''' + # model = MobileNetV3(model_mode='LARGE') + model = timm_create_model( + model_name="mobilenetv3_large_100", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + + elif model_name == 'efficientnet': + model = timm_create_model( + model_name="efficientnet_b0", + pretrained=args.pretrained, + num_classes=output_dim, + drop_rate=args.drop, + # drop_connect_rate=args.drop_connect, # DEPRECATED, use drop_path + drop_path_rate=args.drop_path, + drop_block_rate=args.drop_block, + global_pool=args.gp, + bn_tf=args.bn_tf, + bn_momentum=args.bn_momentum, + bn_eps=args.bn_eps) + else: + raise Exception("no such model") + return model + + +if __name__ == "__main__": + logging.basicConfig() + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + + # parser = add_args(argparse.ArgumentParser(description='FedAvg-standalone')) + # parser = argparse.ArgumentParser(description='FedAvg-standalone') + + parser = argparse.ArgumentParser() + args = add_args(parser) + args.rank = 0 + args.wd = args.weight_decay + + logger.info(args) + device = torch.device("cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu") + logger.info(device) + + # initialize the wandb machine learning experimental tracking platform (https://www.wandb.com/). 
+ name_model_ema = "-model_ema" if args.model_ema else "-no_model_ema" + name_aa = args.aa if args.aa is not None else "_None" + wandb.init( + entity="automl", + project="fedcv-classification", + name="fedavg (d)" + str(args.partition_method) + "-" +str(args.dataset)+ + "-e" + str(args.epochs) + "-" + str(args.model) + "-" + + args.data_transform + "-aa" + name_aa + "-" + str(args.opt) + + name_model_ema + "-bs" + str(args.batch_size) + + "-lr" + str(args.lr) + "-wd" + str(args.wd), + config=args + ) + + # Set the random seed. The np.random seed determines the dataset partition. + # The torch_manual_seed determines the initial weight. + # We fix these two, so that we can reproduce the result. + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + # load data + dataset = load_data(args, args.dataset) + + # create model. + # Note if the model is DNN (e.g., ResNet), the training will be very slow. + # In this case, please use our FedML distributed version (./fedml_experiments/distributed_fedavg) + model = create_model(args, model_name=args.model, output_dim=dataset[7]) + model_trainer = ClassificationTrainer(model, device, args) + logging.info(model) + + fedavgAPI = FedAvgAPI(dataset, device, args, model_trainer) + fedavgAPI.train() diff --git a/experiments/standalone/yolov5/_init.py b/experiments/standalone/yolov5/_init.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/experiments/standalone/yolov5/_init.py @@ -0,0 +1 @@ + diff --git a/experiments/standalone/yolov5/client.py b/experiments/standalone/yolov5/client.py new file mode 100644 index 0000000..847bed3 --- /dev/null +++ b/experiments/standalone/yolov5/client.py @@ -0,0 +1,590 @@ +import logging + +import os +import torch +from torch import nn + + +import time +from pathlib import Path +from threading import Thread +from warnings import warn + +import math +import random +import numpy as np +import torch.distributed as dist +import torch.nn.functional as 
F +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler +import torch.utils.data +import yaml +# from apex import amp +from torch.cuda import amp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +import test # import test.py to get mAP after each epoch +# from fedml_api.model.object_detection.yolov5.models.experimental import +from fedml_api.model.object_detection.yolov5.models.experimental import attempt_load +from fedml_api.model.object_detection.yolov5.models.yolo import Model +from fedml_api.model.object_detection.yolov5.utils.autoanchor import check_anchors +# from utils.datasets import create_dataloader +from fedml_api.model.object_detection.yolov5.utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \ + fitness, strip_optimizer, get_latest_run, check_dataset, check_file, check_git_status, check_img_size, \ + print_mutation, set_logging +from fedml_api.model.object_detection.yolov5.utils.google_utils import attempt_download +from fedml_api.model.object_detection.yolov5.utils.loss import compute_loss +from fedml_api.model.object_detection.yolov5.utils.plots import plot_images, plot_labels, plot_results, plot_evolution +from fedml_api.model.object_detection.yolov5.utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first + +from fedml_api.data_preprocessing.coco_detection.datasets import partition_data +from fedml_api.data_preprocessing.coco_detection.datasets import create_dataloader + + + +logger = logging.getLogger(__name__) + + +class Client: + + def __init__(self, client_idx, local_training_data, local_sample_number, opt, device, model, tb_writer, wandb, hyp): + self.client_idx = client_idx + self.local_training_data = local_training_data + # self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + 
logging.info("self.local_sample_number = " + str(self.local_sample_number)) + + self.opt = opt + self.device = device + self.model = model + self.hyp = hyp + self.tb_writer = tb_writer + self.wandb = wandb + + ''' + stackoverflow_lr is the task of multi-label classification + please refer to following links for detailed explainations on cross-entropy and corresponding implementation of tff research: + https://towardsdatascience.com/cross-entropy-for-classification-d98e7f974451 + https://github.com/google-research/federated/blob/49a43456aa5eaee3e1749855eed89c0087983541/optimization/stackoverflow_lr/federated_stackoverflow_lr.py#L131 + ''' + # if self.args.dataset == "stackoverflow_lr": + # self.criterion = nn.BCELoss(reduction = 'sum').to(device) + # else: + # self.criterion = nn.CrossEntropyLoss().to(device) + + def update_local_dataset(self, client_idx, local_training_data, local_sample_number): + self.client_idx = client_idx + self.local_training_data = local_training_data + # self.local_test_data = local_test_data + self.local_sample_number = local_sample_number + + def get_sample_number(self): + return self.local_sample_number + + def train(self, w_global, dataset, dataloader, wandb): + self.wandb = wandb + logger.info(f'Hyperparameters {self.hyp}') + save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + Path( + self.opt.save_dir), self.opt.epochs, self.opt.batch_size, self.opt.total_batch_size, self.opt.weights, self.opt.global_rank + + # Directories + wdir = save_dir / 'weights' + wdir.mkdir(parents=True, exist_ok=True) # make dir + last = wdir / 'last.pt' + best = wdir / 'best.pt' + results_file = save_dir / 'results.txt' + + # Save run settings + with open(save_dir / 'hyp.yaml', 'w') as f: + yaml.dump(self.hyp, f, sort_keys=False) + with open(save_dir / 'opt.yaml', 'w') as f: + yaml.dump(vars(self.opt), f, sort_keys=False) + + # Configure + plots = not self.opt.evolve # create plots + cuda = self.device.type != 'cpu' + init_seeds(2 + rank) + 
with open(self.opt.data) as f: + data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict + with torch_distributed_zero_first(rank): + check_dataset(data_dict) # check + train_path = data_dict['train'] + test_path = data_dict['val'] + nc, names = (1, ['item']) if self.opt.single_cls else ( + int(data_dict['nc']), data_dict['names']) # number classes, names + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, self.opt.data) # check + + # Model + pretrained = weights.endswith('.pt') + if pretrained: + with torch_distributed_zero_first(rank): + attempt_download(weights) # download if not found locally + ckpt = torch.load(weights, map_location=self.device) # load checkpoint + if self.hyp.get('anchors'): + ckpt['model'].yaml['anchors'] = round(self.hyp['anchors']) # force autoanchor + model = Model(self.opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(self.device) # create + exclude = ['anchor'] if self.opt.cfg or self.hyp.get('anchors') else [] # exclude keys + state_dict = ckpt['model'].float().state_dict() # to FP32 + state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect + model.load_state_dict(state_dict, strict=False) # load + logger.info( + 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + else: + model = Model(self.opt.cfg, ch=3, nc=nc).to(self.device) # create + + # Freeze + freeze = [] # parameter names to freeze (full or partial) + for k, v in model.named_parameters(): + v.requires_grad = True # train all layers + if any(x in k for x in freeze): + print('freezing %s' % k) + v.requires_grad = False + + # Optimizer + nbs = 64 # nominal batch size + accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing + self.hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + for k, v in model.named_modules(): + if hasattr(v, 'bias') and 
isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d): + pg0.append(v.weight) # no decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + if self.opt.adam: + optimizer = optim.Adam(pg0, lr=self.hyp['lr0'], betas=(self.hyp['momentum'], 0.999)) # adjust beta1 to momentum + else: + optimizer = optim.SGD(pg0, lr=self.hyp['lr0'], momentum=self.hyp['momentum'], nesterov=True) + + optimizer.add_param_group({'params': pg1, 'weight_decay': self.hyp['weight_decay']}) # add pg1 with weight_decay + optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) + del pg0, pg1, pg2 + + # Scheduler https://arxiv.org/pdf/1812.01187.pdf + # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR + lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - self.hyp['lrf']) + self.hyp['lrf'] # cosine + scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) + # plot_lr_scheduler(optimizer, scheduler, epochs) + + # Logging + if self.wandb and self.wandb.run is None: + self.opt.hyp = self.hyp # add hyperparameters + wandb_run = self.wandb.init(config=self.opt, resume="allow", + project='YOLOv5' if self.opt.project == 'runs/train' else Path( + self.opt.project).stem, + name=save_dir.stem, + id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) + loggers = {'wandb': self.wandb} # loggers dict + + # Resume + start_epoch, best_fitness = 0, 0.0 + if pretrained: + # Optimizer + if ckpt['optimizer'] is not None: + optimizer.load_state_dict(ckpt['optimizer']) + best_fitness = ckpt['best_fitness'] + + # Results + if ckpt.get('training_results') is not None: + with open(results_file, 'w') as file: + file.write(ckpt['training_results']) # write results.txt + + # Epochs + start_epoch = ckpt['epoch'] + 1 + if self.opt.resume: + assert start_epoch > 0, '%s 
training to %g epochs is finished, nothing to resume.' % (weights, epochs) + if epochs < start_epoch: + logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % + (weights, ckpt['epoch'], epochs)) + epochs += ckpt['epoch'] # finetune additional epochs + + del ckpt, state_dict + + # Image sizes + gs = int(max(model.stride)) # grid size (max stride) + imgsz, imgsz_test = [check_img_size(x, gs) for x in self.opt.img_size] # verify imgsz are gs-multiples + + # DP mode + if cuda and rank == -1 and torch.cuda.device_count() > 1: + model = torch.nn.DataParallel(model) + + # SyncBatchNorm + if self.opt.sync_bn and cuda and rank != -1: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(self.device) + logger.info('Using SyncBatchNorm()') + + # EMA + ema = ModelEMA(model) if rank in [-1, 0] else None + + # DDP mode + if cuda and rank != -1: + model = DDP(model, device_ids=[self.opt.local_rank], output_device=self.opt.local_rank) + + # Trainloader + # dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, self.opt, + # hyp=self.hyp, augment=True, cache=self.opt.cache_images, rect=self.opt.rect, + # rank=rank, + # world_size=self.opt.world_size, workers=self.opt.workers, + # image_weights=self.opt.image_weights) + + # client + # client_number = self.opt.client_number + # partition = self.opt.partition + # net_dataidx_map = partition_data(train_path, partition=partition, n_nets=client_number) + # train_data_loader_dict = dict() + # for i in range(client_number): + # dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, self.opt, + # hyp=self.hyp, augment=True, cache=self.opt.cache_images, + # rect=self.opt.rect, + # rank=rank, + # world_size=self.opt.world_size, workers=self.opt.workers, + # image_weights=self.opt.image_weights, + # net_dataidx_map=net_dataidx_map[i]) + # + # train_data_loader_dict[i] = dataloader + # self.client_list.append(Client(i, train_data_loader_dict[i], len(dataset), 
self.opt, self.device, model)) + + # TODO: train_client + # client sampling + # client train + # logging info + + # train_data_num = sum([len(net_dataidx_map[r]) for r in range(client_number)]) + # client_number_per_round = self.opt.client_num_per_round + + mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class + nb = len(dataloader) # number of batches + print("nb:", nb) + assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( + mlc, nc, self.opt.data, nc - 1) + + # Process 0 + if rank in [-1, 0]: + ema.updates = start_epoch * nb // accumulate # set EMA updates + testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, self.opt, # testloader + hyp=self.hyp, cache=self.opt.cache_images and not self.opt.notest, rect=True, + rank=-1, world_size=self.opt.world_size, workers=self.opt.workers, pad=0.5)[ + 0] + + if not self.opt.resume: + labels = np.concatenate(dataset.labels, 0) + c = torch.tensor(labels[:, 0]) # classes + # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency + # model._initialize_biases(cf.to(self.device)) + if plots: + Thread(target=plot_labels, args=(labels, save_dir, loggers), daemon=True).start() + if self.tb_writer: + self.tb_writer.add_histogram('classes', c, 0) + + # Anchors + if not self.opt.noautoanchor: + check_anchors(dataset, model=model, thr=self.hyp['anchor_t'], imgsz=imgsz) + + # Model parameters + self.hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset + model.nc = nc # attach number of classes to model + model.hyp = self.hyp # attach hyperparameters to model + model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) + model.class_weights = labels_to_class_weights(dataset.labels, nc).to(self.device) # attach class weights + model.names = names + + # Start training + t0 = time.time() + nw = max(round(self.hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) + # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training + maps = np.zeros(nc) # mAP per class + results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + scheduler.last_epoch = start_epoch - 1 # do not move + scaler = amp.GradScaler(enabled=cuda) + logger.info('Image sizes %g train, %g test\n' + 'Using %g dataloader workers\nLogging results to %s\n' + 'Starting training for %g epochs...' % ( + imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs)) + model = self.model + model.load_state_dict(w_global) + model.to(self.device) + for epoch in range(start_epoch, + epochs): # epoch ------------------------------------------------------------------ + model.train() + + # client_indexes = client_sampling(epoch, client_number, client_number_per_round) + # logging.info("client_indexes = " + str(client_indexes)) + + # Update image weights (optional) + if self.opt.image_weights: + # Generate indices + if rank in [-1, 0]: + cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights + iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights + dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx + # Broadcast if DDP + if rank != -1: + indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() + dist.broadcast(indices, 0) + if rank != 0: + dataset.indices = indices.cpu().numpy() + + # Update mosaic border + # b 
= int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) + # dataset.mosaic_border = [b - imgsz, -b] # height, width borders + + mloss = torch.zeros(4, device=self.device) # mean losses + if rank != -1: + dataloader.sampler.set_epoch(epoch) + pbar = enumerate(dataloader) + logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) + if rank in [-1, 0]: + pbar = tqdm(pbar, total=nb) # progress bar + optimizer.zero_grad() + for i, ( + imgs, targets, paths, + _) in pbar: # batch ------------------------------------------------------------- + ni = i + nb * epoch # number integrated batches (since train start) + imgs = imgs.to(self.device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 + + # Warmup + if ni <= nw: + xi = [0, nw] # x interp + # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) + accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) + for j, x in enumerate(optimizer.param_groups): + # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 + x['lr'] = np.interp(ni, xi, + [self.hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) + if 'momentum' in x: + x['momentum'] = np.interp(ni, xi, [self.hyp['warmup_momentum'],self.hyp['momentum']]) + + # Multi-scale + if self.opt.multi_scale: + sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size + sf = sz / max(imgs.shape[2:]) # scale factor + if sf != 1: + ns = [math.ceil(x * sf / gs) * gs for x in + imgs.shape[2:]] # new shape (stretched to gs-multiple) + imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) + + # Forward + with amp.autocast(enabled=cuda): + pred = model(imgs) # forward + loss, loss_items = compute_loss(pred, targets.to(self.device), model) # loss scaled by batch_size + if rank != -1: + loss *= self.opt.world_size # gradient averaged between devices in DDP mode + + # Backward + 
scaler.scale(loss).backward() + + # Optimize + # if ni % accumulate == 0: + # scaler.step(optimizer) # optimizer.step + # scaler.update() + # optimizer.zero_grad() + # if ema: + # ema.update(model) + + # Print + if rank in [-1, 0]: + mloss = (mloss * i + loss_items) / (i + 1) # update mean losses + mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) + s = ('%10s' * 2 + '%10.4g' * 6) % ( + '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) + pbar.set_description(s) + + # Plot + if plots and ni < 3: + f = save_dir / f'train_batch{ni}.jpg' # filename + Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() + # if tb_writer: + # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) + # tb_writer.add_graph(model, imgs) # add model to tensorboard + elif plots and ni == 3 and self.wandb: + self.wandb.log( + {"Mosaics": [self.wandb.Image(str(x), caption=x.name) for x in + save_dir.glob('train*.jpg')]}) + + # end batch ------------------------------------------------------------------------------------------------ + # end epoch ---------------------------------------------------------------------------------------------------- + + # Scheduler + lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard + scheduler.step() + + # DDP process 0 or single-GPU + if rank in [-1, 0]: + # mAP + if ema: + ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) + final_epoch = epoch + 1 == epochs + if not self.opt.notest or final_epoch: # Calculate mAP + results, maps, times = test.test(self.opt.data, + batch_size=total_batch_size, + imgsz=imgsz_test, + model=ema.ema, + single_cls=self.opt.single_cls, + dataloader=testloader, + save_dir=save_dir, + plots=plots and final_epoch, + log_imgs=self.opt.log_imgs if self.wandb else 0) + + # Write + with open(results_file, 'a') as f: + f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP@.5, mAP@.5-.95, 
val_loss(box, obj, cls) + if len(self.opt.name) and self.opt.bucket: + os.system( + 'gsutil cp %s gs://%s/results/results%s.txt' % (results_file, self.opt.bucket, self.opt.name)) + + # Log + tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss + 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', + 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss + 'x/lr0', 'x/lr1', 'x/lr2'] # params + for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): + if self.tb_writer: + self.tb_writer.add_scalar(tag, x, epoch) # tensorboard + if self.wandb: + self.wandb.log({tag: x}) # W&B + + # Update best mAP + fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95] + if fi > best_fitness: + best_fitness = fi + + # Save model + save = (not self.opt.nosave) or (final_epoch and not self.opt.evolve) + if save: + with open(results_file, 'r') as f: # create checkpoint + ckpt = {'epoch': epoch, + 'best_fitness': best_fitness, + 'training_results': f.read(), + 'model': ema.ema, + 'optimizer': None if final_epoch else optimizer.state_dict(), + 'wandb_id': wandb_run.id if self.wandb else None} + + # Save last, best and delete + torch.save(ckpt, last) + if best_fitness == fi: + torch.save(ckpt, best) + del ckpt + # end epoch ---------------------------------------------------------------------------------------------------- + # end training + + if rank in [-1, 0]: + # Strip optimizers + for f in [last, best]: + if f.exists(): # is *.pt + strip_optimizer(f) # strip optimizer + os.system( + 'gsutil cp %s gs://%s/weights' % (f, self.opt.bucket)) if self.opt.bucket else None # upload + + # Plots + if plots: + plot_results(save_dir=save_dir) # save as results.png + if self.wandb: + files = ['results.png', 'precision_recall_curve.png', 'confusion_matrix.png'] + self.wandb.log({"Results": [self.wandb.Image(str(save_dir / f), caption=f) for f in files + if (save_dir / f).exists()]}) + 
logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) + + # Test best.pt + # if self.opt.data.endswith('coco.yaml') and nc == 80: # if COCO + # results, _, _ = test.test(self.opt.data, + # batch_size=total_batch_size, + # imgsz=imgsz_test, + # model=attempt_load(best if best.exists() else last, self.device).half(), + # single_cls=self.opt.single_cls, + # dataloader=testloader, + # save_dir=save_dir, + # save_json=True, # use pycocotools + # plots=False) + + else: + dist.destroy_process_group() + + self.wandb.run.finish() if self.wandb and self.wandb.run else None + torch.cuda.empty_cache() + return model.cpu().state_dict(), mloss #, results + + # def train(self, w_global): + # self.model.train() + # self.model.load_state_dict(w_global) + # self.model.to(self.device) + # + # # train and update + # if self.args.client_optimizer == "sgd": + # optimizer = torch.optim.SGD(self.model.parameters(), lr=self.args.lr) + # else: + # optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.args.lr, + # weight_decay=self.args.wd, amsgrad=True) + # + # epoch_loss = [] + # for epoch in range(self.args.epochs): + # batch_loss = [] + # for batch_idx, (x, labels) in enumerate(self.local_training_data): + # x, labels = x.to(self.device), labels.to(self.device) + # # logging.info("x.size = " + str(x.size())) + # # logging.info("labels.size = " + str(labels.size())) + # self.model.zero_grad() + # log_probs = self.model(x) + # loss = self.criterion(log_probs, labels) + # loss.backward() + # + # # to avoid nan loss + # # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5) + # + # optimizer.step() + # # logging.info('Update Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + # # epoch, (batch_idx + 1) * self.args.batch_size, len(self.local_training_data) * self.args.batch_size, + # # 100. 
    def local_test(self, model_global, b_use_test_dataset=False):
        """Evaluate ``model_global`` on this client's local data.

        Args:
            model_global: the (shared) global model to evaluate; it is moved to
                ``self.device`` and switched to eval mode as a side effect.
            b_use_test_dataset: if True evaluate on ``self.local_test_data``,
                otherwise on ``self.local_training_data``.

        Returns:
            dict with keys ``test_correct``, ``test_loss``, ``test_precision``,
            ``test_recall``, ``test_total``. Precision/recall are only
            accumulated for the multi-label ``stackoverflow_lr`` dataset and
            stay 0 otherwise. ``test_loss`` is the sum of per-sample losses
            (loss.item() scaled by batch size), not an average.
        """
        model_global.eval()
        model_global.to(self.device)
        metrics = {
            'test_correct': 0,
            'test_loss' : 0,
            'test_precision': 0,
            'test_recall': 0,
            'test_total' : 0
        }
        if b_use_test_dataset:
            test_data = self.local_test_data
        else:
            test_data = self.local_training_data
        with torch.no_grad():
            for batch_idx, (x, target) in enumerate(test_data):
                x = x.to(self.device)
                target = target.to(self.device)
                pred = model_global(x)
                loss = self.criterion(pred, target)

                if self.args.dataset == "stackoverflow_lr":
                    # Multi-label case: threshold logits at 0.5, count a sample as
                    # correct only if ALL labels match (eq(...).sum over labels == label count).
                    predicted = (pred > .5).int()
                    correct = predicted.eq(target).sum(axis = -1).eq(target.size(1)).sum()
                    # > .1 turns the {0,1} product into a boolean true-positive mask;
                    # 1e-13 guards against division by zero for empty predictions/targets.
                    true_positive = ((target * predicted) > .1).int().sum(axis = -1)
                    precision = true_positive / (predicted.sum(axis = -1) + 1e-13)
                    recall = true_positive / (target.sum(axis = -1) + 1e-13)
                    metrics['test_precision'] += precision.sum().item()
                    metrics['test_recall'] += recall.sum().item()
                else:
                    # Single-label case: argmax over class dimension.
                    _, predicted = torch.max(pred, 1)
                    correct = predicted.eq(target).sum()

                metrics['test_correct'] += correct.item()
                metrics['test_loss'] += loss.item() * target.size(0)
                if len(target.size()) == 1:  # classification: one label per sample
                    metrics['test_total'] += target.size(0)
                elif len(target.size()) == 2:  # for tasks of next word prediction
                    metrics['test_total'] += target.size(0) * target.size(1)

        return metrics
def test(data,
         weights=None,
         batch_size=32,
         imgsz=640,
         conf_thres=0.001,
         iou_thres=0.6,  # for NMS
         save_json=False,
         single_cls=False,
         augment=False,
         verbose=False,
         model=None,
         dataloader=None,
         save_dir=Path(''),  # for saving images
         save_txt=False,  # for auto-labelling
         save_hybrid=False,  # for hybrid auto-labelling
         save_conf=False,  # save auto-label confidences
         plots=True,
         log_imgs=0):  # number of logged images
    """Run YOLOv5 evaluation (mAP / P / R) over a dataset.

    Args:
        data: path to a dataset *.yaml (or, when called from train.py, the path
            whose yaml is loaded below).
        weights: checkpoint path(s); only used when ``model`` is None.
        model: an already-built model => "training" mode (called from train.py);
            when None the model is loaded from ``weights`` and the module-level
            ``opt`` namespace is consulted for device/paths.
        dataloader: reused when given; otherwise built via create_dataloader.

    Returns:
        ((mp, mr, map50, map, box_loss, obj_loss, cls_loss), maps, t) where
        ``maps`` is the per-class mAP array and ``t`` holds timing info.

    NOTE(review): this copy carries local CPU-forcing edits (``device = 'cpu'``,
    ``device = torch.device('cpu')``, ``inf_out = inf_out.cpu()``) — presumably
    to evaluate on CPU during federated simulation; confirm intent before reuse.
    """
    # Initialize/load model and set device
    training = model is not None
    if training:  # called by train.py
        device = next(model.parameters()).device  # get model device

    else:  # called directly
        set_logging()
        device = select_device(opt.device, batch_size=batch_size)

        device = 'cpu'  # NOTE(review): overrides select_device — forces CPU load below
        # Directories
        save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
        (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

        # Load model
        model = attempt_load(weights, map_location=device)  # load FP32 model
        imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size

        # Multi-GPU disabled, incompatible with .half() https://github.com/ultralytics/yolov5/issues/99
        # if device.type != 'cpu' and torch.cuda.device_count() > 1:
        #     model = nn.DataParallel(model)

    device = torch.device('cpu')  # NOTE(review): forces CPU in BOTH paths, so half below is always False
    # Half
    half = device.type != 'cpu'  # half precision only supported on CUDA
    if half:
        model.half()

    # Configure
    model = model.to(device)
    model.eval()
    is_coco = data.endswith('coco.yaml')  # is COCO dataset
    with open(data) as f:
        data = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    check_dataset(data)  # check
    nc = 1 if single_cls else int(data['nc'])  # number of classes
    iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
    niou = iouv.numel()

    # Logging
    log_imgs, wandb = min(log_imgs, 100), None  # ceil
    try:
        import wandb  # Weights & Biases
    except ImportError:
        log_imgs = 0

    # Dataloader
    if not training:
        img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
        _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once
        path = data['test'] if opt.task == 'test' else data['val']  # path to val/test images
        dataloader = create_dataloader(path, imgsz, batch_size, model.stride.max(), opt, pad=0.5, rect=True)[0]

    seen = 0
    confusion_matrix = ConfusionMatrix(nc=nc)
    names = {k: v for k, v in enumerate(model.names if hasattr(model, 'names') else model.module.names)}
    coco91class = coco80_to_coco91_class()
    s = ('%20s' + '%12s' * 6) % ('Class', 'Images', 'Targets', 'P', 'R', 'mAP@.5', 'mAP@.5:.95')
    p, r, f1, mp, mr, map50, map, t0, t1 = 0., 0., 0., 0., 0., 0., 0., 0., 0.
    loss = torch.zeros(3, device=device)
    jdict, stats, ap, ap_class, wandb_images = [], [], [], [], []
    for batch_i, (img, targets, paths, shapes) in enumerate(tqdm(dataloader, desc=s)):
        img = img.to(device, non_blocking=True)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        targets = targets.to(device)
        nb, _, height, width = img.shape  # batch size, channels, height, width

        with torch.no_grad():
            # Run model
            t = time_synchronized()
            inf_out, train_out = model(img, augment=augment)  # inference and training outputs
            t0 += time_synchronized() - t

            # Compute loss
            if training:
                loss += compute_loss([x.float() for x in train_out], targets, model)[1][:3]  # box, obj, cls

            # Run NMS
            targets[:, 2:] *= torch.Tensor([width, height, width, height]).to(device)  # to pixels
            lb = [targets[targets[:, 0] == i, 1:] for i in range(nb)] if save_hybrid else []  # for autolabelling
            t = time_synchronized()
            inf_out = inf_out.cpu()  # NOTE(review): local edit — NMS runs on CPU tensors here
            output = non_max_suppression(inf_out, conf_thres=conf_thres, iou_thres=iou_thres, labels=lb)
            t1 += time_synchronized() - t

        # Statistics per image
        targets = targets.cpu()
        for si, pred in enumerate(output):
            labels = targets[targets[:, 0] == si, 1:]
            nl = len(labels)
            tcls = labels[:, 0].tolist() if nl else []  # target class
            path = Path(paths[si])
            seen += 1

            if len(pred) == 0:
                # No detections: still record targets so recall is penalized.
                if nl:
                    stats.append((torch.zeros(0, niou, dtype=torch.bool), torch.Tensor(), torch.Tensor(), tcls))
                continue

            # Predictions
            predn = pred.clone()
            scale_coords(img[si].shape[1:], predn[:, :4], shapes[si][0], shapes[si][1])  # native-space pred

            # Append to text file
            if save_txt:
                gn = torch.tensor(shapes[si][0])[[1, 0, 1, 0]]  # normalization gain whwh
                for *xyxy, conf, cls in predn.tolist():
                    xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                    line = (cls, *xywh, conf) if save_conf else (cls, *xywh)  # label format
                    with open(save_dir / 'labels' / (path.stem + '.txt'), 'a') as f:
                        f.write(('%g ' * len(line)).rstrip() % line + '\n')

            # W&B logging
            if plots and len(wandb_images) < log_imgs:
                box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
                             "class_id": int(cls),
                             "box_caption": "%s %.3f" % (names[cls], conf),
                             "scores": {"class_score": conf},
                             "domain": "pixel"} for *xyxy, conf, cls in pred.tolist()]
                boxes = {"predictions": {"box_data": box_data, "class_labels": names}}  # inference-space
                wandb_images.append(wandb.Image(img[si], boxes=boxes, caption=path.name))

            # Append to pycocotools JSON dictionary
            if save_json:
                # [{"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236}, ...
                image_id = int(path.stem) if path.stem.isnumeric() else path.stem
                box = xyxy2xywh(predn[:, :4])  # xywh
                box[:, :2] -= box[:, 2:] / 2  # xy center to top-left corner
                for p, b in zip(pred.tolist(), box.tolist()):
                    jdict.append({'image_id': image_id,
                                  'category_id': coco91class[int(p[5])] if is_coco else int(p[5]),
                                  'bbox': [round(x, 3) for x in b],
                                  'score': round(p[4], 5)})

            # Assign all predictions as incorrect
            correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool, device=device)
            if nl:
                detected = []  # target indices
                tcls_tensor = labels[:, 0]

                # target boxes
                tbox = xywh2xyxy(labels[:, 1:5])
                scale_coords(img[si].shape[1:], tbox, shapes[si][0], shapes[si][1])  # native-space labels
                if plots:
                    confusion_matrix.process_batch(pred, torch.cat((labels[:, 0:1], tbox), 1))

                # Per target class
                for cls in torch.unique(tcls_tensor):
                    ti = (cls == tcls_tensor).nonzero(as_tuple=False).view(-1)  # prediction indices
                    pi = (cls == pred[:, 5]).nonzero(as_tuple=False).view(-1)  # target indices

                    # Search for detections
                    if pi.shape[0]:
                        # Prediction to target ious
                        ious, i = box_iou(predn[pi, :4], tbox[ti]).max(1)  # best ious, indices

                        # Append detections: greedily match each target at most once.
                        detected_set = set()
                        for j in (ious > iouv[0]).nonzero(as_tuple=False):
                            d = ti[i[j]]  # detected target
                            if d.item() not in detected_set:
                                detected_set.add(d.item())
                                detected.append(d)
                                correct[pi[j]] = ious[j] > iouv  # iou_thres is 1xn
                                if len(detected) == nl:  # all targets already located in image
                                    break

            # Append statistics (correct, conf, pcls, tcls)
            stats.append((correct.cpu(), pred[:, 4].cpu(), pred[:, 5].cpu(), tcls))

        # Plot images
        if plots and batch_i < 3:
            f = save_dir / f'test_batch{batch_i}_labels.jpg'  # labels
            Thread(target=plot_images, args=(img, targets, paths, f, names), daemon=True).start()
            f = save_dir / f'test_batch{batch_i}_pred.jpg'  # predictions
            Thread(target=plot_images, args=(img, output_to_target(output), paths, f, names), daemon=True).start()

    # Compute statistics
    stats = [np.concatenate(x, 0) for x in zip(*stats)]  # to numpy
    if len(stats) and stats[0].any():
        p, r, ap, f1, ap_class = ap_per_class(*stats, plot=plots, save_dir=save_dir, names=names)
        p, r, ap50, ap = p[:, 0], r[:, 0], ap[:, 0], ap.mean(1)  # [P, R, AP@0.5, AP@0.5:0.95]
        mp, mr, map50, map = p.mean(), r.mean(), ap50.mean(), ap.mean()
        nt = np.bincount(stats[3].astype(np.int64), minlength=nc)  # number of targets per class
    else:
        nt = torch.zeros(1)

    # Print results
    pf = '%20s' + '%12.3g' * 6  # print format
    print(pf % ('all', seen, nt.sum(), mp, mr, map50, map))

    # Print results per class
    if verbose and nc > 1 and len(stats):
        for i, c in enumerate(ap_class):
            print(pf % (names[c], seen, nt[c], p[i], r[i], ap50[i], ap[i]))

    # Print speeds
    t = tuple(x / seen * 1E3 for x in (t0, t1, t0 + t1)) + (imgsz, imgsz, batch_size)  # tuple
    if not training:
        print('Speed: %.1f/%.1f/%.1f ms inference/NMS/total per %gx%g image at batch-size %g' % t)

    # Plots
    if plots:
        confusion_matrix.plot(save_dir=save_dir, names=list(names.values()))
        if wandb and wandb.run:
            wandb.log({"Images": wandb_images})
            wandb.log({"Validation": [wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('test*.jpg'))]})

    # Save JSON
    if save_json and len(jdict):
        w = Path(weights[0] if isinstance(weights, list) else weights).stem if weights is not None else ''  # weights
        anno_json = '../coco/annotations/instances_val2017.json'  # annotations json
        pred_json = str(save_dir / f"{w}_predictions.json")  # predictions json
        print('\nEvaluating pycocotools mAP... saving %s...' % pred_json)
        with open(pred_json, 'w') as f:
            json.dump(jdict, f)

        try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
            from pycocotools.coco import COCO
            from pycocotools.cocoeval import COCOeval

            anno = COCO(anno_json)  # init annotations api
            pred = anno.loadRes(pred_json)  # init predictions api
            eval = COCOeval(anno, pred, 'bbox')
            if is_coco:
                eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files]  # image IDs to evaluate
            eval.evaluate()
            eval.accumulate()
            eval.summarize()
            map, map50 = eval.stats[:2]  # update results (mAP@0.5:0.95, mAP@0.5)
        except Exception as e:
            print(f'pycocotools unable to run: {e}')

    # Return results
    if not training:
        s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
        print(f"Results saved to {save_dir}{s}")
    model.float()  # for training
    model.cuda()  # NOTE(review): unconditional — will raise on CPU-only hosts even though device was forced to cpu above; confirm
    maps = np.zeros(nc) + map
    for i, c in enumerate(ap_class):
        maps[c] = ap[i]
    return (mp, mr, map50, map, *(loss.cpu() / len(dataloader)).tolist()), maps, t
size (pixels)') + parser.add_argument('--conf-thres', type=float, default=0.001, help='object confidence threshold') + parser.add_argument('--iou-thres', type=float, default=0.6, help='IOU threshold for NMS') + parser.add_argument('--task', default='val', help="'val', 'test', 'study'") + parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') + parser.add_argument('--single-cls', action='store_true', help='treat as single-class dataset') + parser.add_argument('--augment', action='store_true', help='augmented inference') + parser.add_argument('--verbose', action='store_true', help='report mAP by class') + parser.add_argument('--save-txt', action='store_true', help='save results to *.txt') + parser.add_argument('--save-hybrid', action='store_true', help='save label+prediction hybrid results to *.txt') + parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels') + parser.add_argument('--save-json', action='store_true', help='save a cocoapi-compatible JSON results file') + parser.add_argument('--project', default='runs/test', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') + opt = parser.parse_args() + opt.save_json |= opt.data.endswith('coco.yaml') + opt.data = check_file(opt.data) # check file + print(opt) + opt.device = 'cpu' + if opt.task in ['val', 'test']: # run normally + test(opt.data, + opt.weights, + opt.batch_size, + opt.img_size, + opt.conf_thres, + opt.iou_thres, + opt.save_json, + opt.single_cls, + opt.augment, + opt.verbose, + save_txt=opt.save_txt | opt.save_hybrid, + save_hybrid=opt.save_hybrid, + save_conf=opt.save_conf, + ) + + elif opt.task == 'study': # run over a range of settings and save/plot + for weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']: + f = 'study_%s_%s.txt' % 
(Path(opt.data).stem, Path(weights).stem) # filename to save to + x = list(range(320, 800, 64)) # x axis + y = [] # y axis + for i in x: # img-size + print('\nRunning %s point %s...' % (f, i)) + r, _, t = test(opt.data, weights, opt.batch_size, i, opt.conf_thres, opt.iou_thres, opt.save_json, + plots=False) + y.append(r + t) # results and times + np.savetxt(f, y, fmt='%10.4g') # save + os.system('zip -r study.zip study_*.txt') + plot_study_txt(f, x) # plot diff --git a/experiments/standalone/yolov5/train.py b/experiments/standalone/yolov5/train.py new file mode 100644 index 0000000..27e642d --- /dev/null +++ b/experiments/standalone/yolov5/train.py @@ -0,0 +1,656 @@ +import argparse +import logging +import os +import random +import copy +import time +from pathlib import Path +from threading import Thread +from warnings import warn +import collections +import sys +sys.path.append('fedml/FedML-master') + + +import math +import numpy as np +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler +import torch.utils.data +import yaml +# from apex import amp +from torch.cuda import amp +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from tqdm import tqdm + +import test # import test.py to get mAP after each epoch +# from fedml_api.model.object_detection.yolov5.models.experimental import +from fedml_api.model.object_detection.yolov5.models.experimental import attempt_load +from fedml_api.model.object_detection.yolov5.models.yolo import Model +from models.yolo import Model +from fedml_api.model.object_detection.yolov5.utils.autoanchor import check_anchors +# from utils.datasets import create_dataloader +from fedml_api.model.object_detection.yolov5.utils.general import labels_to_class_weights, increment_path, labels_to_image_weights, init_seeds, \ + fitness, strip_optimizer, get_latest_run, 
def aggregate(w_locals):
    """FedAvg aggregation: sample-count-weighted average of client weights.

    Args:
        w_locals: list of ``(sample_num, model_params)`` tuples, one per
            client, where ``model_params`` is a state-dict-like mapping from
            parameter name to tensor (or any value supporting ``*`` and ``+``).

    Returns:
        A NEW dict mapping each parameter name to the weighted average
        ``sum_i (n_i / N) * params_i[k]`` with ``N = sum_i n_i``.

    Note:
        Unlike a naive in-place implementation, this does not alias or mutate
        ``w_locals[0]``'s parameter dict — clients' local state is untouched.
    """
    # Total number of training samples across the selected clients.
    training_num = sum(sample_num for sample_num, _ in w_locals)

    # Build the average into a fresh dict so callers' state dicts are not mutated.
    averaged_params = {}
    for k in w_locals[0][1].keys():
        for i, (local_sample_number, local_model_params) in enumerate(w_locals):
            w = local_sample_number / training_num
            if i == 0:
                averaged_params[k] = local_model_params[k] * w
            else:
                # Re-binding (not +=) avoids in-place ops on client tensors.
                averaged_params[k] = averaged_params[k] + local_model_params[k] * w
    return averaged_params
np.random.choice(range(client_num_in_total), num_clients, replace=False) + logging.info("client_indexes = %s" % str(client_indexes)) + return client_indexes + +def train(hyp, opt, device, tb_writer=None, wandb=None): + logger.info(f'Hyperparameters {hyp}') + save_dir, epochs, batch_size, total_batch_size, weights, rank = \ + Path(opt.save_dir), opt.comm_round, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank + + # Directories + wdir = save_dir / 'weights' + wdir.mkdir(parents=True, exist_ok=True) # make dir + last = wdir / 'last.pt' + best = wdir / 'best.pt' + results_file = save_dir / 'results.txt' + + # Save run settings + with open(save_dir / 'hyp.yaml', 'w') as f: + yaml.dump(hyp, f, sort_keys=False) + with open(save_dir / 'opt.yaml', 'w') as f: + yaml.dump(vars(opt), f, sort_keys=False) + + # Configure + plots = not opt.evolve # create plots + cuda = device.type != 'cpu' + init_seeds(2 + rank) + with open(opt.data) as f: + data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict + with torch_distributed_zero_first(rank): + check_dataset(data_dict) # check + train_path = data_dict['train'] + test_path = data_dict['val'] + nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names + assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check + + # Model + pretrained = weights.endswith('.pt') + if pretrained: + with torch_distributed_zero_first(rank): + attempt_download(weights) # download if not found locally + ckpt = torch.load(weights, map_location=device) # load checkpoint + if hyp.get('anchors'): + ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor + model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create + # ckpt = collections.defaultdict() + + exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [] # exclude keys + state_dict = ckpt['model'].float().state_dict() # to FP32 + state_dict = 
intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect + model.load_state_dict(state_dict, strict=False) # load + logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report + else: + model = Model(opt.cfg, ch=3, nc=nc).to(device) # create + # model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(device) + # Freeze + freeze = [] # parameter names to freeze (full or partial) + for k, v in model.named_parameters(): + v.requires_grad = True # train all layers + if any(x in k for x in freeze): + print('freezing %s' % k) + v.requires_grad = False + + # Optimizer + nbs = 64 # nominal batch size + accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing + hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay + + pg0, pg1, pg2 = [], [], [] # optimizer parameter groups + for k, v in model.named_modules(): + if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): + pg2.append(v.bias) # biases + if isinstance(v, nn.BatchNorm2d): + pg0.append(v.weight) # no decay + elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): + pg1.append(v.weight) # apply decay + + if opt.adam: + optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum + else: + optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) + + optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay + optimizer.add_param_group({'params': pg2}) # add pg2 (biases) + logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) + del pg0, pg1, pg2 + + # Scheduler https://arxiv.org/pdf/1812.01187.pdf + # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR + lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp['lrf']) + hyp['lrf'] # cosine + scheduler = 
lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) + # plot_lr_scheduler(optimizer, scheduler, epochs) + + # Logging + if wandb and wandb.run is None: + opt.hyp = hyp # add hyperparameters + wandb_run = wandb.init(config=opt, resume="allow", + project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, + name=save_dir.stem, + id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) + loggers = {'wandb': wandb} # loggers dict + + # Resume + start_epoch, best_fitness = 0, 0.0 + if pretrained: + # Optimizer + if ckpt['optimizer'] is not None: + optimizer.load_state_dict(ckpt['optimizer']) + best_fitness = ckpt['best_fitness'] + + # Results + if ckpt.get('training_results') is not None: + with open(results_file, 'w') as file: + file.write(ckpt['training_results']) # write results.txt + + # Epochs + start_epoch = ckpt['epoch'] + 1 + print("start_epoch:", start_epoch) + # start_epoch = 1 #250 + if opt.resume: + assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) + if epochs < start_epoch: + logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% + (weights, ckpt['epoch'], epochs)) + epochs += ckpt['epoch'] # finetune additional epochs + + del ckpt, state_dict + # start_epoch = 0 + # Image sizes + gs = int(max(model.stride)) # grid size (max stride) + imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples + + # DP mode + if cuda and rank == -1 and torch.cuda.device_count() > 1: + model = torch.nn.DataParallel(model) + + # SyncBatchNorm + if opt.sync_bn and cuda and rank != -1: + model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) + logger.info('Using SyncBatchNorm()') + + # EMA + ema = ModelEMA(model) if rank in [-1, 0] else None + + # DDP mode + if cuda and rank != -1: + model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) + + # Trainloader + dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, + hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, + world_size=opt.world_size, workers=opt.workers, + image_weights=opt.image_weights) + mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class + nb = len(dataloader) # number of batches + assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) + + # Process 0 + if rank in [-1, 0]: + ema.updates = start_epoch * nb // accumulate # set EMA updates + testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, # testloader + hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, + rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5)[0] + + if not opt.resume: + labels = np.concatenate(dataset.labels, 0) + c = torch.tensor(labels[:, 0]) # classes + # cf = torch.bincount(c.long(), minlength=nc) + 1. 
# frequency + # model._initialize_biases(cf.to(device)) + if plots: + Thread(target=plot_labels, args=(labels, save_dir, loggers), daemon=True).start() + if tb_writer: + tb_writer.add_histogram('classes', c, 0) + + # Anchors + if not opt.noautoanchor: + check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) + + # Model parameters + hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset + model.nc = nc # attach number of classes to model + model.hyp = hyp # attach hyperparameters to model + model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) + model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights + model.names = names + + # Start training + t0 = time.time() + nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) + # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training + maps = np.zeros(nc) # mAP per class + results = (0, 0, 0, 0, 0, 0, 0) # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + scheduler.last_epoch = start_epoch - 1 # do not move + scaler = amp.GradScaler(enabled=cuda) + logger.info('Image sizes %g train, %g test\n' + 'Using %g dataloader workers\nLogging results to %s\n' + 'Starting training for %g epochs...' 
% (imgsz, imgsz_test, dataloader.num_workers, save_dir, epochs)) + + # client + client_list = [] + client_number = opt.client_number + partition = opt.partition + net_dataidx_map = partition_data(train_path, partition=partition, n_nets=client_number) + train_data_loader_dict = dict() + train_data_num_dict = dict() + train_dataset_dict = dict() + for i in range(client_number): + print("net_dataidx_map trainer:", net_dataidx_map[i]) + dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, + hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, + rank=rank, + world_size=opt.world_size, workers=opt.workers, + image_weights=opt.image_weights, + net_dataidx_map=net_dataidx_map[i]) + train_dataset_dict[i] = dataset + train_data_num_dict[i] = len(dataset) + train_data_loader_dict[i] = dataloader + client_list.append( + Client(i, train_data_loader_dict[i], len(dataset), opt, device, model, tb_writer=tb_writer, + hyp=hyp, wandb=wandb)) + # fedml + w_global = model.state_dict() + print("comm_round:", opt.comm_round) + for round_idx in range(start_epoch, opt.comm_round): + logging.info("################Communication round : {}".format(round_idx)) + w_locals, loss_locals = [], [] + + client_indexes = client_sampling(round_idx, opt.client_number, opt.client_num_per_round) + logging.info("client_indexes = " + str(client_indexes)) + + for idx, client in enumerate(client_list): + client_idx = client_indexes[idx] + client.update_local_dataset(client_idx, train_data_loader_dict[client_idx], train_data_num_dict[client_idx]) + + client_model = client.model + client_model.to(device) + client_model.train() + + client_dataset = train_dataset_dict[client_idx] + client_dataloader = train_data_loader_dict[client_idx] + nb = len(client_dataloader) + + if opt.image_weights: + # Generate indices + if rank in [-1, 0]: + cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights + iw = labels_to_image_weights(client_dataset.labels, nc=nc, 
class_weights=cw) # image weights + client_dataset.indices = random.choices(range(client_dataset.n), weights=iw, k=client_dataset.n) # rand weighted idx + # Broadcast if DDP + if rank != -1: + indices = (torch.tensor(client_dataset.indices) if rank == 0 else torch.zeros(client_dataset.n)).int() + dist.broadcast(indices, 0) + if rank != 0: + client_dataset.indices = indices.cpu().numpy() + + + mloss = torch.zeros(4, device=device) # mean losses + if rank != -1: + client_dataloader.sampler.set_epoch(round_idx) + pbar = enumerate(client_dataloader) + logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) + if rank in [-1, 0]: + pbar = tqdm(pbar, total=nb) # progress bar + optimizer.zero_grad() + for i, ( + imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- + ni = i + nb * round_idx # number integrated batches (since train start) + imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 + + # Warmup + if ni <= nw: + xi = [0, nw] # x interp + # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) + accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) + for j, x in enumerate(optimizer.param_groups): + # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 + x['lr'] = np.interp(ni, xi, + [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(round_idx)]) + if 'momentum' in x: + x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) + + # Multi-scale + if opt.multi_scale: + sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size + sf = sz / max(imgs.shape[2:]) # scale factor + if sf != 1: + ns = [math.ceil(x * sf / gs) * gs for x in + imgs.shape[2:]] # new shape (stretched to gs-multiple) + imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) + + # Forward + # print("imgs:", imgs.dtype) + with 
amp.autocast(enabled=cuda): + pred = client_model(imgs) # forward + loss, loss_items = compute_loss(pred, targets.to(device), client_model) # loss scaled by batch_size + if rank != -1: + loss *= opt.world_size # gradient averaged between devices in DDP mode + + # Backward + scaler.scale(loss).backward() + + # Optimize + if ni % accumulate == 0: + scaler.step(optimizer) # optimizer.step + scaler.update() + optimizer.zero_grad() + if ema: + ema.update(model) + + # Print + if rank in [-1, 0]: + mloss = (mloss * i + loss_items) / (i + 1) # update mean losses + mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) + s = ('%10s' * 2 + '%10.4g' * 6) % ( + '%g/%g' % (round_idx, opt.comm_round - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) + pbar.set_description(s) + + # Plot + if plots and ni < 3: + f = save_dir / f'train_batch{ni}.jpg' # filename + Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() + # if tb_writer: + # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) + # tb_writer.add_graph(model, imgs) # add model to tensorboard + elif plots and ni == 3 and wandb: + wandb.log( + {"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg')]}) + + w_locals.append((client.get_sample_number(), copy.deepcopy(client_model.cpu().state_dict()))) + + w_global = aggregate(w_locals) + + + + lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard + scheduler.step() + + if round_idx % opt.frequency_of_the_test == 0 or round_idx == opt.comm_round - 1: + model.load_state_dict(w_global) + + else: + continue + # DDP process 0 or single-GPU + if rank in [-1, 0]: + # mAP + if ema: + ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) + final_epoch = round_idx + 1 == opt.comm_round + if not opt.notest or final_epoch: # Calculate mAP + results, maps, times = test.test(opt.data, + batch_size=total_batch_size, + imgsz=imgsz_test, + 
model=ema.ema, + single_cls=opt.single_cls, + dataloader=testloader, + save_dir=save_dir, + plots=plots and final_epoch, + log_imgs=opt.log_imgs if wandb else 0) + + # Write + with open(results_file, 'a') as f: + f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP@.5, mAP@.5-.95, val_loss(box, obj, cls) + if len(opt.name) and opt.bucket: + os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) + + # Log + tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss + 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', + 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss + 'x/lr0', 'x/lr1', 'x/lr2'] # params + for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): + if tb_writer: + tb_writer.add_scalar(tag, x, round_idx) # tensorboard + if wandb: + wandb.log({tag: x}) # W&B + + # Update best mAP + fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95] + if fi > best_fitness: + best_fitness = fi + + # Save model + save = (not opt.nosave) or (final_epoch and not opt.evolve) + if save: + with open(results_file, 'r') as f: # create checkpoint + ckpt = {'epoch': round_idx, + 'best_fitness': best_fitness, + 'training_results': f.read(), + 'model': ema.ema, + 'optimizer': None if final_epoch else optimizer.state_dict(), + 'wandb_id': wandb_run.id if wandb else None} + + # Save last, best and delete + torch.save(ckpt, last) + if best_fitness == fi: + torch.save(ckpt, best) + del ckpt + + + + + wandb.run.finish() if wandb and wandb.run else None + torch.cuda.empty_cache() + return results + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path') + parser.add_argument('--cfg', type=str, default='', help='model.yaml path') + parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path') + 
parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') + parser.add_argument('--epochs', type=int, default=400) + parser.add_argument('--batch-size', type=int, default=1, help='total batch size for all GPUs') + parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') + parser.add_argument('--rect', action='store_true', help='rectangular training') + parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') + parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') + parser.add_argument('--notest', action='store_true', help='only test final epoch') + parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') + parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') + parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') + parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') + parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') + parser.add_argument('--device', default='', help='cuda device, i.e. 
0 or 0,1,2,3 or cpu') + parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') + parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') + parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') + parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') + parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') + parser.add_argument('--log-imgs', type=int, default=4, help='number of images for W&B logging, max 100') + parser.add_argument('--workers', type=int, default=1, help='maximum number of dataloader workers') + parser.add_argument('--project', default='runs/train', help='save to project/name') + parser.add_argument('--name', default='exp', help='save to project/name') + parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') + + # fedml + parser.add_argument('--partition', type=str, default='homo', help='hyperparameters path') + parser.add_argument('--client_number', type=int, default=8, help='maximum number of dataloader workers') + parser.add_argument('--client_num_per_round', type=int, default=8, help='maximum number of dataloader workers') + parser.add_argument('--comm_round', type=int, default=400, help='maximum number of dataloader workers') + parser.add_argument('--frequency_of_the_test', type=int, default=10) + + opt = parser.parse_args() + + # Set DDP variables + opt.total_batch_size = opt.batch_size + opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 + opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 + set_logging(opt.global_rank) + if opt.global_rank in [-1, 0]: + check_git_status() + + # Resume + if opt.resume: # resume an interrupted run + ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path + 
assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' + with open(Path(ckpt).parent.parent / 'opt.yaml') as f: + opt = argparse.Namespace(**yaml.load(f, Loader=yaml.FullLoader)) # replace + opt.cfg, opt.weights, opt.resume = '', ckpt, True + logger.info('Resuming training from %s' % ckpt) + else: + # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') + opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files + assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' + opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) + opt.name = 'evolve' if opt.evolve else opt.name + opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) # increment run + + # DDP mode + device = select_device(opt.device, batch_size=opt.batch_size) + if opt.local_rank != -1: + assert torch.cuda.device_count() > opt.local_rank + torch.cuda.set_device(opt.local_rank) + device = torch.device('cuda', opt.local_rank) + dist.init_process_group(backend='nccl', init_method='env://') # distributed backend + assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' + opt.batch_size = opt.total_batch_size // opt.world_size + + # Hyperparameters + with open(opt.hyp) as f: + hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps + if 'box' not in hyp: + warn('Compatibility: %s missing "box" which was renamed from "giou" in %s' % + (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120')) + hyp['box'] = hyp.pop('giou') + + # Train + logger.info(opt) + if not opt.evolve: + tb_writer = None # init loggers + if opt.global_rank in [-1, 0]: + logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/') + tb_writer = SummaryWriter(opt.save_dir) # Tensorboard + train(hyp, opt, device, tb_writer, wandb) + + # 
Evolve hyperparameters (optional) + else: + # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) + meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) + 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) + 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 + 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay + 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) + 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum + 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr + 'box': (1, 0.02, 0.2), # box loss gain + 'cls': (1, 0.2, 4.0), # cls loss gain + 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight + 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) + 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight + 'iou_t': (0, 0.1, 0.7), # IoU training threshold + 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold + 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) + 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) + 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) + 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) + 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) + 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) + 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) + 'scale': (1, 0.0, 0.9), # image scale (+/- gain) + 'shear': (1, 0.0, 10.0), # image shear (+/- deg) + 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 + 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) + 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) + 'mosaic': (1, 0.0, 1.0), # image mixup (probability) + 'mixup': (1, 0.0, 1.0)} # image mixup (probability) + + assert opt.local_rank == -1, 'DDP mode not implemented for --evolve' + opt.notest, opt.nosave = True, True # only test/save final epoch + # ei = 
[isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices + yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here + if opt.bucket: + os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists + + for _ in range(300): # generations to evolve + if Path('evolve.txt').exists(): # if evolve.txt exists: select best hyps and mutate + # Select parent(s) + parent = 'single' # parent selection method: 'single' or 'weighted' + x = np.loadtxt('evolve.txt', ndmin=2) + n = min(5, len(x)) # number of previous results to consider + x = x[np.argsort(-fitness(x))][:n] # top n mutations + w = fitness(x) - fitness(x).min() # weights + if parent == 'single' or len(x) == 1: + # x = x[random.randint(0, n - 1)] # random selection + x = x[random.choices(range(n), weights=w)[0]] # weighted selection + elif parent == 'weighted': + x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination + + # Mutate + mp, s = 0.8, 0.2 # mutation probability, sigma + npr = np.random + npr.seed(int(time.time())) + g = np.array([x[0] for x in meta.values()]) # gains 0-1 + ng = len(meta) + v = np.ones(ng) + while all(v == 1): # mutate until a change occurs (prevent duplicates) + v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) + for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) + hyp[k] = float(x[i + 7] * v[i]) # mutate + + # Constrain to limits + for k, v in meta.items(): + hyp[k] = max(hyp[k], v[1]) # lower limit + hyp[k] = min(hyp[k], v[2]) # upper limit + hyp[k] = round(hyp[k], 5) # significant digits + + # Train mutation + results = train(hyp.copy(), opt, device, wandb=wandb) + + # Write mutation results + print_mutation(hyp.copy(), results, yaml_file, opt.bucket) + + # Plot results + plot_evolution(yaml_file) + print(f'Hyperparameter evolution complete. 
Best results saved as: {yaml_file}\n' + f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}') diff --git a/model/classification/timm_models.md b/model/classification/timm_models.md new file mode 100644 index 0000000..7430b09 --- /dev/null +++ b/model/classification/timm_models.md @@ -0,0 +1,20 @@ +# EfficientNet-B0 with RandAugment - 77.7 top-1, 95.3 top-5 +Michael Klachko achieved these results with the command line for B2 adapted for larger batch size, with the recommended B0 dropout rate of 0.2. + +``` +./distributed_train.sh 2 /imagenet/ --model efficientnet_b0 -b 384 --sched step --epochs 450 --decay-epochs 2.4 --decay-rate .97 --opt rmsproptf --opt-eps .001 -j 8 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .048 +``` + +# MobileNetV3-Large-100 - 75.766 top-1, 92,542 top-5 +``` +./distributed_train.sh 2 /imagenet/ --model mobilenetv3_large_100 -b 512 --sched step --epochs 600 --decay-epochs 2.4 --decay-rate .973 --opt rmsproptf --opt-eps .001 -j 7 --warmup-lr 1e-6 --weight-decay 1e-5 --drop 0.2 --drop-connect 0.2 --model-ema --model-ema-decay 0.9999 --aa rand-m9-mstd0.5 --remode pixel --reprob 0.2 --amp --lr .064 --lr-noise 0.42 0.9 +``` + + + + + + + + + diff --git a/model/detection/yolov5/utils/activations.py b/model/detection/yolov5/utils/activations.py new file mode 100644 index 0000000..24f5a30 --- /dev/null +++ b/model/detection/yolov5/utils/activations.py @@ -0,0 +1,72 @@ +# Activation functions + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +# Swish https://arxiv.org/pdf/1905.02244.pdf --------------------------------------------------------------------------- +class Swish(nn.Module): # + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +class Hardswish(nn.Module): # export-friendly version of nn.Hardswish() + @staticmethod + def forward(x): + # 
return x * F.hardsigmoid(x) # for torchscript and CoreML + return x * F.hardtanh(x + 3, 0., 6.) / 6. # for torchscript, CoreML and ONNX + + +class MemoryEfficientSwish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x * torch.sigmoid(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + return grad_output * (sx * (1 + x * (1 - sx))) + + def forward(self, x): + return self.F.apply(x) + + +# Mish https://github.com/digantamisra98/Mish -------------------------------------------------------------------------- +class Mish(nn.Module): + @staticmethod + def forward(x): + return x * F.softplus(x).tanh() + + +class MemoryEfficientMish(nn.Module): + class F(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return x.mul(torch.tanh(F.softplus(x))) # x * tanh(ln(1 + exp(x))) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + sx = torch.sigmoid(x) + fx = F.softplus(x).tanh() + return grad_output * (fx + x * sx * (1 - fx * fx)) + + def forward(self, x): + return self.F.apply(x) + + +# FReLU https://arxiv.org/abs/2007.11824 ------------------------------------------------------------------------------- +class FReLU(nn.Module): + def __init__(self, c1, k=3): # ch_in, kernel + super().__init__() + self.conv = nn.Conv2d(c1, c1, k, 1, 1, groups=c1, bias=False) + self.bn = nn.BatchNorm2d(c1) + + def forward(self, x): + return torch.max(x, self.bn(self.conv(x))) diff --git a/model/detection/yolov5/utils/datasets.py b/model/detection/yolov5/utils/datasets.py new file mode 100644 index 0000000..313180f --- /dev/null +++ b/model/detection/yolov5/utils/datasets.py @@ -0,0 +1,933 @@ +# Dataset utils and dataloaders + +import glob +import logging +import math +import os +import random +import shutil +import time +from itertools import repeat +from multiprocessing.pool import ThreadPool +from 
pathlib import Path +from threading import Thread + +import cv2 +import numpy as np +import torch +from PIL import Image, ExifTags +from torch.utils.data import Dataset +from tqdm import tqdm + +from .general import xyxy2xywh, xywh2xyxy +from .torch_utils import torch_distributed_zero_first + +# Parameters +help_url = 'https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data' +img_formats = ['bmp', 'jpg', 'jpeg', 'png', 'tif', 'tiff', 'dng'] # acceptable image suffixes +vid_formats = ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv'] # acceptable video suffixes +logger = logging.getLogger(__name__) + +# Get orientation exif tag +for orientation in ExifTags.TAGS.keys(): + if ExifTags.TAGS[orientation] == 'Orientation': + break + + +def get_hash(files): + # Returns a single hash value of a list of files + return sum(os.path.getsize(f) for f in files if os.path.isfile(f)) + + +def exif_size(img): + # Returns exif-corrected PIL size + s = img.size # (width, height) + try: + rotation = dict(img._getexif().items())[orientation] + if rotation == 6: # rotation 270 + s = (s[1], s[0]) + elif rotation == 8: # rotation 90 + s = (s[1], s[0]) + except: + pass + + return s + + +def create_dataloader(path, imgsz, batch_size, stride, opt, hyp=None, augment=False, cache=False, pad=0.0, rect=False, + rank=-1, world_size=1, workers=8, image_weights=False): + # Make sure only the first process in DDP process the dataset first, and the following others can use the cache + with torch_distributed_zero_first(rank): + dataset = LoadImagesAndLabels(path, imgsz, batch_size, + augment=augment, # augment images + hyp=hyp, # augmentation hyperparameters + rect=rect, # rectangular training + cache_images=cache, + single_cls=opt.single_cls, + stride=int(stride), + pad=pad, + rank=rank, + image_weights=image_weights) + + batch_size = min(batch_size, len(dataset)) + nw = min([os.cpu_count() // world_size, batch_size if batch_size > 1 else 0, workers]) # number of workers + sampler = 
torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None + loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader + # Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader() + dataloader = loader(dataset, + batch_size=batch_size, + num_workers=nw, + sampler=sampler, + pin_memory=True, + collate_fn=LoadImagesAndLabels.collate_fn) + return dataloader, dataset + + +class InfiniteDataLoader(torch.utils.data.dataloader.DataLoader): + """ Dataloader that reuses workers + + Uses same syntax as vanilla DataLoader + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + object.__setattr__(self, 'batch_sampler', _RepeatSampler(self.batch_sampler)) + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever + + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) + + +class LoadImages: # for inference + def __init__(self, path, img_size=640): + p = str(Path(path)) # os-agnostic + p = os.path.abspath(p) # absolute path + if '*' in p: + files = sorted(glob.glob(p, recursive=True)) # glob + elif os.path.isdir(p): + files = sorted(glob.glob(os.path.join(p, '*.*'))) # dir + elif os.path.isfile(p): + files = [p] # files + else: + raise Exception('ERROR: %s does not exist' % p) + + images = [x for x in files if x.split('.')[-1].lower() in img_formats] + videos = [x for x in files if x.split('.')[-1].lower() in vid_formats] + ni, nv = len(images), len(videos) + + self.img_size = img_size + self.files = images + videos + self.nf = ni + nv # number of files + self.video_flag = [False] * ni + [True] * nv + self.mode = 'image' + if any(videos): + 
self.new_video(videos[0]) # new video + else: + self.cap = None + assert self.nf > 0, 'No images or videos found in %s. Supported formats are:\nimages: %s\nvideos: %s' % \ + (p, img_formats, vid_formats) + + def __iter__(self): + self.count = 0 + return self + + def __next__(self): + if self.count == self.nf: + raise StopIteration + path = self.files[self.count] + + if self.video_flag[self.count]: + # Read video + self.mode = 'video' + ret_val, img0 = self.cap.read() + if not ret_val: + self.count += 1 + self.cap.release() + if self.count == self.nf: # last video + raise StopIteration + else: + path = self.files[self.count] + self.new_video(path) + ret_val, img0 = self.cap.read() + + self.frame += 1 + print('video %g/%g (%g/%g) %s: ' % (self.count + 1, self.nf, self.frame, self.nframes, path), end='') + + else: + # Read image + self.count += 1 + img0 = cv2.imread(path) # BGR + assert img0 is not None, 'Image Not Found ' + path + print('image %g/%g %s: ' % (self.count, self.nf, path), end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return path, img, img0, self.cap + + def new_video(self, path): + self.frame = 0 + self.cap = cv2.VideoCapture(path) + self.nframes = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + def __len__(self): + return self.nf # number of files + + +class LoadWebcam: # for inference + def __init__(self, pipe='0', img_size=640): + self.img_size = img_size + + if pipe.isnumeric(): + pipe = eval(pipe) # local camera + # pipe = 'rtsp://192.168.1.64/1' # IP camera + # pipe = 'rtsp://username:password@192.168.1.64/1' # IP camera with login + # pipe = 'http://wmccpinetop.axiscam.net/mjpg/video.mjpg' # IP golf camera + + self.pipe = pipe + self.cap = cv2.VideoCapture(pipe) # video capture object + self.cap.set(cv2.CAP_PROP_BUFFERSIZE, 3) # set buffer size + + def __iter__(self): + self.count = -1 + 
return self + + def __next__(self): + self.count += 1 + if cv2.waitKey(1) == ord('q'): # q to quit + self.cap.release() + cv2.destroyAllWindows() + raise StopIteration + + # Read frame + if self.pipe == 0: # local camera + ret_val, img0 = self.cap.read() + img0 = cv2.flip(img0, 1) # flip left-right + else: # IP camera + n = 0 + while True: + n += 1 + self.cap.grab() + if n % 30 == 0: # skip frames + ret_val, img0 = self.cap.retrieve() + if ret_val: + break + + # Print + assert ret_val, 'Camera Error %s' % self.pipe + img_path = 'webcam.jpg' + print('webcam %g: ' % self.count, end='') + + # Padded resize + img = letterbox(img0, new_shape=self.img_size)[0] + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return img_path, img, img0, None + + def __len__(self): + return 0 + + +class LoadStreams: # multiple IP or RTSP cameras + def __init__(self, sources='streams.txt', img_size=640): + self.mode = 'stream' + self.img_size = img_size + + if os.path.isfile(sources): + with open(sources, 'r') as f: + sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())] + else: + sources = [sources] + + n = len(sources) + self.imgs = [None] * n + self.sources = sources + for i, s in enumerate(sources): + # Start the thread to read frames from the video stream + print('%g/%g: %s... ' % (i + 1, n, s), end='') + cap = cv2.VideoCapture(eval(s) if s.isnumeric() else s) + assert cap.isOpened(), 'Failed to open %s' % s + w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) % 100 + _, self.imgs[i] = cap.read() # guarantee first frame + thread = Thread(target=self.update, args=([i, cap]), daemon=True) + print(' success (%gx%g at %.2f FPS).' 
% (w, h, fps)) + thread.start() + print('') # newline + + # check for common shapes + s = np.stack([letterbox(x, new_shape=self.img_size)[0].shape for x in self.imgs], 0) # inference shapes + self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal + if not self.rect: + print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.') + + def update(self, index, cap): + # Read next stream frame in a daemon thread + n = 0 + while cap.isOpened(): + n += 1 + # _, self.imgs[index] = cap.read() + cap.grab() + if n == 4: # read every 4th frame + _, self.imgs[index] = cap.retrieve() + n = 0 + time.sleep(0.01) # wait time + + def __iter__(self): + self.count = -1 + return self + + def __next__(self): + self.count += 1 + img0 = self.imgs.copy() + if cv2.waitKey(1) == ord('q'): # q to quit + cv2.destroyAllWindows() + raise StopIteration + + # Letterbox + img = [letterbox(x, new_shape=self.img_size, auto=self.rect)[0] for x in img0] + + # Stack + img = np.stack(img, 0) + + # Convert + img = img[:, :, :, ::-1].transpose(0, 3, 1, 2) # BGR to RGB, to bsx3x416x416 + img = np.ascontiguousarray(img) + + return self.sources, img, img0, None + + def __len__(self): + return 0 # 1E12 frames = 32 streams at 30 FPS for 30 years + + +def img2label_paths(img_paths): + # Define label paths as a function of image paths + sa, sb = os.sep + 'images' + os.sep, os.sep + 'labels' + os.sep # /images/, /labels/ substrings + return [x.replace(sa, sb, 1).replace('.' 
+ x.split('.')[-1], '.txt') for x in img_paths] + + +class LoadImagesAndLabels(Dataset): # for training/testing + def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False, + cache_images=False, single_cls=False, stride=32, pad=0.0, rank=-1): + self.img_size = img_size + self.augment = augment + self.hyp = hyp + self.image_weights = image_weights + self.rect = False if image_weights else rect + self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training) + self.mosaic_border = [-img_size // 2, -img_size // 2] + self.stride = stride + + try: + f = [] # image files + for p in path if isinstance(path, list) else [path]: + p = Path(p) # os-agnostic + if p.is_dir(): # dir + f += glob.glob(str(p / '**' / '*.*'), recursive=True) + elif p.is_file(): # file + with open(p, 'r') as t: + t = t.read().strip().splitlines() + parent = str(p.parent) + os.sep + f += [x.replace('./', parent) if x.startswith('./') else x for x in t] # local to global path + else: + raise Exception('%s does not exist' % p) + self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats]) + assert self.img_files, 'No images found' + except Exception as e: + raise Exception('Error loading data from %s: %s\nSee %s' % (path, e, help_url)) + + # Check cache + self.label_files = img2label_paths(self.img_files) # labels + cache_path = Path(self.label_files[0]).parent.with_suffix('.cache') # cached labels + if cache_path.is_file(): + cache = torch.load(cache_path) # load + if cache['hash'] != get_hash(self.label_files + self.img_files) or 'results' not in cache: # changed + cache = self.cache_labels(cache_path) # re-cache + else: + cache = self.cache_labels(cache_path) # cache + + # Display cache + [nf, nm, ne, nc, n] = cache.pop('results') # found, missing, empty, corrupted, total + desc = f"Scanning '{cache_path}' for images and labels... 
{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + tqdm(None, desc=desc, total=n, initial=n) + assert nf > 0 or not augment, f'No labels found in {cache_path}. Can not train without labels. See {help_url}' + + # Read cache + cache.pop('hash') # remove hash + labels, shapes = zip(*cache.values()) + self.labels = list(labels) + self.shapes = np.array(shapes, dtype=np.float64) + self.img_files = list(cache.keys()) # update + self.label_files = img2label_paths(cache.keys()) # update + if single_cls: + for x in self.labels: + x[:, 0] = 0 + + n = len(shapes) # number of images + bi = np.floor(np.arange(n) / batch_size).astype(np.int) # batch index + nb = bi[-1] + 1 # number of batches + self.batch = bi # batch index of image + self.n = n + self.indices = range(n) + + # Rectangular Training + if self.rect: + # Sort by aspect ratio + s = self.shapes # wh + ar = s[:, 1] / s[:, 0] # aspect ratio + irect = ar.argsort() + self.img_files = [self.img_files[i] for i in irect] + self.label_files = [self.label_files[i] for i in irect] + self.labels = [self.labels[i] for i in irect] + self.shapes = s[irect] # wh + ar = ar[irect] + + # Set training image shapes + shapes = [[1, 1]] * nb + for i in range(nb): + ari = ar[bi == i] + mini, maxi = ari.min(), ari.max() + if maxi < 1: + shapes[i] = [maxi, 1] + elif mini > 1: + shapes[i] = [1, 1 / mini] + + self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride + + # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM) + self.imgs = [None] * n + if cache_images: + gb = 0 # Gigabytes of cached images + self.img_hw0, self.img_hw = [None] * n, [None] * n + results = ThreadPool(8).imap(lambda x: load_image(*x), zip(repeat(self), range(n))) # 8 threads + pbar = tqdm(enumerate(results), total=n) + for i, x in pbar: + self.imgs[i], self.img_hw0[i], self.img_hw[i] = x # img, hw_original, hw_resized = load_image(self, i) + gb += self.imgs[i].nbytes + pbar.desc = 
'Caching images (%.1fGB)' % (gb / 1E9) + + def cache_labels(self, path=Path('./labels.cache')): + # Cache dataset labels, check images and read shapes + x = {} # dict + nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, duplicate + pbar = tqdm(zip(self.img_files, self.label_files), desc='Scanning images', total=len(self.img_files)) + for i, (im_file, lb_file) in enumerate(pbar): + try: + # verify images + im = Image.open(im_file) + im.verify() # PIL verify + shape = exif_size(im) # image size + assert (shape[0] > 9) & (shape[1] > 9), 'image size <10 pixels' + + # verify labels + if os.path.isfile(lb_file): + nf += 1 # label found + with open(lb_file, 'r') as f: + l = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + if len(l): + assert l.shape[1] == 5, 'labels require 5 columns each' + assert (l >= 0).all(), 'negative labels' + assert (l[:, 1:] <= 1).all(), 'non-normalized or out of bounds coordinate labels' + assert np.unique(l, axis=0).shape[0] == l.shape[0], 'duplicate labels' + else: + ne += 1 # label empty + l = np.zeros((0, 5), dtype=np.float32) + else: + nm += 1 # label missing + l = np.zeros((0, 5), dtype=np.float32) + x[im_file] = [l, shape] + except Exception as e: + nc += 1 + print('WARNING: Ignoring corrupted image and/or label %s: %s' % (im_file, e)) + + pbar.desc = f"Scanning '{path.parent / path.stem}' for images and labels... " \ + f"{nf} found, {nm} missing, {ne} empty, {nc} corrupted" + + if nf == 0: + print(f'WARNING: No labels found in {path}. 
See {help_url}') + + x['hash'] = get_hash(self.label_files + self.img_files) + x['results'] = [nf, nm, ne, nc, i + 1] + torch.save(x, path) # save for next time + logging.info(f"New cache created: {path}") + return x + + def __len__(self): + return len(self.img_files) + + # def __iter__(self): + # self.count = -1 + # print('ran dataset iter') + # #self.shuffled_vector = np.random.permutation(self.nF) if self.augment else np.arange(self.nF) + # return self + + def __getitem__(self, index): + index = self.indices[index] # linear, shuffled, or image_weights + + hyp = self.hyp + mosaic = self.mosaic and random.random() < hyp['mosaic'] + if mosaic: + # Load mosaic + img, labels = load_mosaic(self, index) + shapes = None + + # MixUp https://arxiv.org/pdf/1710.09412.pdf + if random.random() < hyp['mixup']: + img2, labels2 = load_mosaic(self, random.randint(0, self.n - 1)) + r = np.random.beta(8.0, 8.0) # mixup ratio, alpha=beta=8.0 + img = (img * r + img2 * (1 - r)).astype(np.uint8) + labels = np.concatenate((labels, labels2), 0) + + else: + # Load image + img, (h0, w0), (h, w) = load_image(self, index) + + # Letterbox + shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape + img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment) + shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling + + # Load labels + labels = [] + x = self.labels[index] + if x.size > 0: + # Normalized xywh to pixel xyxy format + labels = x.copy() + labels[:, 1] = ratio[0] * w * (x[:, 1] - x[:, 3] / 2) + pad[0] # pad width + labels[:, 2] = ratio[1] * h * (x[:, 2] - x[:, 4] / 2) + pad[1] # pad height + labels[:, 3] = ratio[0] * w * (x[:, 1] + x[:, 3] / 2) + pad[0] + labels[:, 4] = ratio[1] * h * (x[:, 2] + x[:, 4] / 2) + pad[1] + + if self.augment: + # Augment imagespace + if not mosaic: + img, labels = random_perspective(img, labels, + degrees=hyp['degrees'], + translate=hyp['translate'], + scale=hyp['scale'], + 
shear=hyp['shear'], + perspective=hyp['perspective']) + + # Augment colorspace + augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v']) + + # Apply cutouts + # if random.random() < 0.9: + # labels = cutout(img, labels) + + nL = len(labels) # number of labels + if nL: + labels[:, 1:5] = xyxy2xywh(labels[:, 1:5]) # convert xyxy to xywh + labels[:, [2, 4]] /= img.shape[0] # normalized height 0-1 + labels[:, [1, 3]] /= img.shape[1] # normalized width 0-1 + + if self.augment: + # flip up-down + if random.random() < hyp['flipud']: + img = np.flipud(img) + if nL: + labels[:, 2] = 1 - labels[:, 2] + + # flip left-right + if random.random() < hyp['fliplr']: + img = np.fliplr(img) + if nL: + labels[:, 1] = 1 - labels[:, 1] + + labels_out = torch.zeros((nL, 6)) + if nL: + labels_out[:, 1:] = torch.from_numpy(labels) + + # Convert + img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416 + img = np.ascontiguousarray(img) + + return torch.from_numpy(img), labels_out, self.img_files[index], shapes + + @staticmethod + def collate_fn(batch): + img, label, path, shapes = zip(*batch) # transposed + for i, l in enumerate(label): + l[:, 0] = i # add target image index for build_targets() + return torch.stack(img, 0), torch.cat(label, 0), path, shapes + + +# Ancillary functions -------------------------------------------------------------------------------------------------- +def load_image(self, index): + # loads 1 image from dataset, returns img, original hw, resized hw + img = self.imgs[index] + if img is None: # not cached + path = self.img_files[index] + img = cv2.imread(path) # BGR + assert img is not None, 'Image Not Found ' + path + h0, w0 = img.shape[:2] # orig hw + r = self.img_size / max(h0, w0) # resize image to img_size + if r != 1: # always resize down, only resize up if training with augmentation + interp = cv2.INTER_AREA if r < 1 and not self.augment else cv2.INTER_LINEAR + img = cv2.resize(img, (int(w0 * r), int(h0 * r)), 
interpolation=interp) + return img, (h0, w0), img.shape[:2] # img, hw_original, hw_resized + else: + return self.imgs[index], self.img_hw0[index], self.img_hw[index] # img, hw_original, hw_resized + + +def augment_hsv(img, hgain=0.5, sgain=0.5, vgain=0.5): + r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1 # random gains + hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)) + dtype = img.dtype # uint8 + + x = np.arange(0, 256, dtype=np.int16) + lut_hue = ((x * r[0]) % 180).astype(dtype) + lut_sat = np.clip(x * r[1], 0, 255).astype(dtype) + lut_val = np.clip(x * r[2], 0, 255).astype(dtype) + + img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype) + cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img) # no return needed + + # Histogram equalization + # if random.random() < 0.2: + # for i in range(3): + # img[:, :, i] = cv2.equalizeHist(img[:, :, i]) + + +def load_mosaic(self, index): + # loads images in a mosaic + + labels4 = [] + s = self.img_size + yc, xc = [int(random.uniform(-x, 2 * s + x)) for x in self.mosaic_border] # mosaic center x, y + indices = [index] + [self.indices[random.randint(0, self.n - 1)] for _ in range(3)] # 3 additional image indices + for i, index in enumerate(indices): + # Load image + img, _, (h, w) = load_image(self, index) + + # place img in img4 + if i == 0: # top left + img4 = np.full((s * 2, s * 2, img.shape[2]), 114, dtype=np.uint8) # base image with 4 tiles + x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc # xmin, ymin, xmax, ymax (large image) + x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h # xmin, ymin, xmax, ymax (small image) + elif i == 1: # top right + x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, s * 2), yc + x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h + elif i == 2: # bottom left + x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(s * 2, yc + h) + x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h) 
+ elif i == 3: # bottom right + x1a, y1a, x2a, y2a = xc, yc, min(xc + w, s * 2), min(s * 2, yc + h) + x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h) + + img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + padw = x1a - x1b + padh = y1a - y1b + + # Labels + x = self.labels[index] + labels = x.copy() + if x.size > 0: # Normalized xywh to pixel xyxy format + labels[:, 1] = w * (x[:, 1] - x[:, 3] / 2) + padw + labels[:, 2] = h * (x[:, 2] - x[:, 4] / 2) + padh + labels[:, 3] = w * (x[:, 1] + x[:, 3] / 2) + padw + labels[:, 4] = h * (x[:, 2] + x[:, 4] / 2) + padh + labels4.append(labels) + + # Concat/clip labels + if len(labels4): + labels4 = np.concatenate(labels4, 0) + np.clip(labels4[:, 1:], 0, 2 * s, out=labels4[:, 1:]) # use with random_perspective + # img4, labels4 = replicate(img4, labels4) # replicate + + # Augment + img4, labels4 = random_perspective(img4, labels4, + degrees=self.hyp['degrees'], + translate=self.hyp['translate'], + scale=self.hyp['scale'], + shear=self.hyp['shear'], + perspective=self.hyp['perspective'], + border=self.mosaic_border) # border to remove + + return img4, labels4 + + +def replicate(img, labels): + # Replicate labels + h, w = img.shape[:2] + boxes = labels[:, 1:].astype(int) + x1, y1, x2, y2 = boxes.T + s = ((x2 - x1) + (y2 - y1)) / 2 # side length (pixels) + for i in s.argsort()[:round(s.size * 0.5)]: # smallest indices + x1b, y1b, x2b, y2b = boxes[i] + bh, bw = y2b - y1b, x2b - x1b + yc, xc = int(random.uniform(0, h - bh)), int(random.uniform(0, w - bw)) # offset x, y + x1a, y1a, x2a, y2a = [xc, yc, xc + bw, yc + bh] + img[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b] # img4[ymin:ymax, xmin:xmax] + labels = np.append(labels, [[labels[i, 0], x1a, y1a, x2a, y2a]], axis=0) + + return img, labels + + +def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True): + # Resize image to a 32-pixel-multiple rectangle 
https://github.com/ultralytics/yolov3/issues/232 + shape = img.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better test mAP) + r = min(r, 1.0) + + # Compute padding + ratio = r, r # width, height ratios + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + if auto: # minimum rectangle + dw, dh = np.mod(dw, 32), np.mod(dh, 32) # wh padding + elif scaleFill: # stretch + dw, dh = 0.0, 0.0 + new_unpad = (new_shape[1], new_shape[0]) + ratio = new_shape[1] / shape[1], new_shape[0] / shape[0] # width, height ratios + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border + return img, ratio, (dw, dh) + + +def random_perspective(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, perspective=0.0, border=(0, 0)): + # torchvision.transforms.RandomAffine(degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-10, 10)) + # targets = [cls, xyxy] + + height = img.shape[0] + border[0] * 2 # shape(h,w,c) + width = img.shape[1] + border[1] * 2 + + # Center + C = np.eye(3) + C[0, 2] = -img.shape[1] / 2 # x translation (pixels) + C[1, 2] = -img.shape[0] / 2 # y translation (pixels) + + # Perspective + P = np.eye(3) + P[2, 0] = random.uniform(-perspective, perspective) # x perspective (about y) + P[2, 1] = random.uniform(-perspective, perspective) # y perspective (about x) + + # Rotation and Scale + R = np.eye(3) + a = 
random.uniform(-degrees, degrees) + # a += random.choice([-180, -90, 0, 90]) # add 90deg rotations to small rotations + s = random.uniform(1 - scale, 1 + scale) + # s = 2 ** random.uniform(-scale, scale) + R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s) + + # Shear + S = np.eye(3) + S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # x shear (deg) + S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180) # y shear (deg) + + # Translation + T = np.eye(3) + T[0, 2] = random.uniform(0.5 - translate, 0.5 + translate) * width # x translation (pixels) + T[1, 2] = random.uniform(0.5 - translate, 0.5 + translate) * height # y translation (pixels) + + # Combined rotation matrix + M = T @ S @ R @ P @ C # order of operations (right to left) is IMPORTANT + if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any(): # image changed + if perspective: + img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114)) + else: # affine + img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114)) + + # Visualize + # import matplotlib.pyplot as plt + # ax = plt.subplots(1, 2, figsize=(12, 6))[1].ravel() + # ax[0].imshow(img[:, :, ::-1]) # base + # ax[1].imshow(img2[:, :, ::-1]) # warped + + # Transform label coordinates + n = len(targets) + if n: + # warp points + xy = np.ones((n * 4, 3)) + xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2) # x1y1, x2y2, x1y2, x2y1 + xy = xy @ M.T # transform + if perspective: + xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8) # rescale + else: # affine + xy = xy[:, :2].reshape(n, 8) + + # create new boxes + x = xy[:, [0, 2, 4, 6]] + y = xy[:, [1, 3, 5, 7]] + xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T + + # # apply angle-based reduction of bounding boxes + # radians = a * math.pi / 180 + # reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 + # x = (xy[:, 2] + xy[:, 0]) / 2 + # y = (xy[:, 3] 
+ xy[:, 1]) / 2 + # w = (xy[:, 2] - xy[:, 0]) * reduction + # h = (xy[:, 3] - xy[:, 1]) * reduction + # xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T + + # clip boxes + xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width) + xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height) + + # filter candidates + i = box_candidates(box1=targets[:, 1:5].T * s, box2=xy.T) + targets = targets[i] + targets[:, 1:5] = xy[i] + + return img, targets + + +def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.1): # box1(4,n), box2(4,n) + # Compute candidate boxes: box1 before augment, box2 after augment, wh_thr (pixels), aspect_ratio_thr, area_ratio + w1, h1 = box1[2] - box1[0], box1[3] - box1[1] + w2, h2 = box2[2] - box2[0], box2[3] - box2[1] + ar = np.maximum(w2 / (h2 + 1e-16), h2 / (w2 + 1e-16)) # aspect ratio + return (w2 > wh_thr) & (h2 > wh_thr) & (w2 * h2 / (w1 * h1 + 1e-16) > area_thr) & (ar < ar_thr) # candidates + + +def cutout(image, labels): + # Applies image cutout augmentation https://arxiv.org/abs/1708.04552 + h, w = image.shape[:2] + + def bbox_ioa(box1, box2): + # Returns the intersection over box2 area given box1, box2. box1 is 4, box2 is nx4. 
boxes are x1y1x2y2 + box2 = box2.transpose() + + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + + # Intersection area + inter_area = (np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1)).clip(0) * \ + (np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1)).clip(0) + + # box2 area + box2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + 1e-16 + + # Intersection over box2 area + return inter_area / box2_area + + # create random masks + scales = [0.5] * 1 + [0.25] * 2 + [0.125] * 4 + [0.0625] * 8 + [0.03125] * 16 # image size fraction + for s in scales: + mask_h = random.randint(1, int(h * s)) + mask_w = random.randint(1, int(w * s)) + + # box + xmin = max(0, random.randint(0, w) - mask_w // 2) + ymin = max(0, random.randint(0, h) - mask_h // 2) + xmax = min(w, xmin + mask_w) + ymax = min(h, ymin + mask_h) + + # apply random color mask + image[ymin:ymax, xmin:xmax] = [random.randint(64, 191) for _ in range(3)] + + # return unobscured labels + if len(labels) and s > 0.03: + box = np.array([xmin, ymin, xmax, ymax], dtype=np.float32) + ioa = bbox_ioa(box, labels[:, 1:5]) # intersection over area + labels = labels[ioa < 0.60] # remove >60% obscured labels + + return labels + + +def create_folder(path='./new'): + # Create folder + if os.path.exists(path): + shutil.rmtree(path) # delete output folder + os.makedirs(path) # make new output folder + + +def flatten_recursive(path='../coco128'): + # Flatten a recursive directory by bringing all files to top level + new_path = Path(path + '_flat') + create_folder(new_path) + for file in tqdm(glob.glob(str(Path(path)) + '/**/*.*', recursive=True)): + shutil.copyfile(file, new_path / Path(file).name) + + +def extract_boxes(path='../coco128/'): # from utils.datasets import *; extract_boxes('../coco128') + # Convert detection dataset into classification dataset, with one directory per class + + path = Path(path) # 
images dir + shutil.rmtree(path / 'classifier') if (path / 'classifier').is_dir() else None # remove existing + files = list(path.rglob('*.*')) + n = len(files) # number of files + for im_file in tqdm(files, total=n): + if im_file.suffix[1:] in img_formats: + # image + im = cv2.imread(str(im_file))[..., ::-1] # BGR to RGB + h, w = im.shape[:2] + + # labels + lb_file = Path(img2label_paths([str(im_file)])[0]) + if Path(lb_file).exists(): + with open(lb_file, 'r') as f: + lb = np.array([x.split() for x in f.read().strip().splitlines()], dtype=np.float32) # labels + + for j, x in enumerate(lb): + c = int(x[0]) # class + f = (path / 'classifier') / f'{c}' / f'{path.stem}_{im_file.stem}_{j}.jpg' # new filename + if not f.parent.is_dir(): + f.parent.mkdir(parents=True) + + b = x[1:] * [w, h, w, h] # box + # b[2:] = b[2:].max() # rectangle to square + b[2:] = b[2:] * 1.2 + 3 # pad + b = xywh2xyxy(b.reshape(-1, 4)).ravel().astype(np.int) + + b[[0, 2]] = np.clip(b[[0, 2]], 0, w) # clip boxes outside of image + b[[1, 3]] = np.clip(b[[1, 3]], 0, h) + assert cv2.imwrite(str(f), im[b[1]:b[3], b[0]:b[2]]), f'box failure in {f}' + + +def autosplit(path='../coco128', weights=(0.9, 0.1, 0.0)): # from utils.datasets import *; autosplit('../coco128') + """ Autosplit a dataset into train/val/test splits and save path/autosplit_*.txt files + # Arguments + path: Path to images directory + weights: Train, val, test weights (list) + """ + path = Path(path) # images dir + files = list(path.rglob('*.*')) + n = len(files) # number of files + indices = random.choices([0, 1, 2], weights=weights, k=n) # assign each image to a split + txt = ['autosplit_train.txt', 'autosplit_val.txt', 'autosplit_test.txt'] # 3 txt files + [(path / x).unlink() for x in txt if (path / x).exists()] # remove existing + for i, img in tqdm(zip(indices, files), total=n): + if img.suffix[1:] in img_formats: + with open(path / txt[i], 'a') as f: + f.write(str(img) + '\n') # add image to txt file diff --git 
a/model/detection/yolov5/utils/google_app_engine/Dockerfile b/model/detection/yolov5/utils/google_app_engine/Dockerfile new file mode 100644 index 0000000..0155618 --- /dev/null +++ b/model/detection/yolov5/utils/google_app_engine/Dockerfile @@ -0,0 +1,25 @@ +FROM gcr.io/google-appengine/python + +# Create a virtualenv for dependencies. This isolates these packages from +# system-level packages. +# Use -p python3 or -p python3.7 to select python version. Default is version 2. +RUN virtualenv /env -p python3 + +# Setting these environment variables are the same as running +# source /env/bin/activate. +ENV VIRTUAL_ENV /env +ENV PATH /env/bin:$PATH + +RUN apt-get update && apt-get install -y python-opencv + +# Copy the application's requirements.txt and run pip to install all +# dependencies into the virtualenv. +ADD requirements.txt /app/requirements.txt +RUN pip install -r /app/requirements.txt + +# Add the application source code. +ADD . /app + +# Run a WSGI server to serve the application. gunicorn must be declared as +# a dependency in requirements.txt. 
+CMD gunicorn -b :$PORT main:app diff --git a/model/detection/yolov5/utils/google_app_engine/additional_requirements.txt b/model/detection/yolov5/utils/google_app_engine/additional_requirements.txt new file mode 100644 index 0000000..5fcc305 --- /dev/null +++ b/model/detection/yolov5/utils/google_app_engine/additional_requirements.txt @@ -0,0 +1,4 @@ +# add these requirements in your app on top of the existing ones +pip==18.1 +Flask==1.0.2 +gunicorn==19.9.0 diff --git a/model/detection/yolov5/utils/google_app_engine/app.yaml b/model/detection/yolov5/utils/google_app_engine/app.yaml new file mode 100644 index 0000000..ac29d10 --- /dev/null +++ b/model/detection/yolov5/utils/google_app_engine/app.yaml @@ -0,0 +1,14 @@ +runtime: custom +env: flex + +service: yolov5app + +liveness_check: + initial_delay_sec: 600 + +manual_scaling: + instances: 1 +resources: + cpu: 1 + memory_gb: 4 + disk_size_gb: 20 \ No newline at end of file diff --git a/model/detection/yolov5/utils/metrics.py b/model/detection/yolov5/utils/metrics.py new file mode 100644 index 0000000..99d5bcf --- /dev/null +++ b/model/detection/yolov5/utils/metrics.py @@ -0,0 +1,200 @@ +# Model validation metrics + +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np +import torch + +from . import general + + +def fitness(x): + # Model fitness as a weighted combination of metrics + w = [0.0, 0.0, 0.1, 0.9] # weights for [P, R, mAP@0.5, mAP@0.5:0.95] + return (x[:, :4] * w).sum(1) + + +def ap_per_class(tp, conf, pred_cls, target_cls, plot=False, save_dir='precision-recall_curve.png', names=[]): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (nparray, nx1 or nx10). + conf: Objectness value from 0-1 (nparray). + pred_cls: Predicted object classes (nparray). + target_cls: True object classes (nparray). 
+ plot: Plot precision-recall curve at mAP@0.5 + save_dir: Plot save directory + # Returns + The average precision as computed in py-faster-rcnn. + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(target_cls) + + # Create Precision-Recall curve and compute AP for each class + px, py = np.linspace(0, 1, 1000), [] # for plotting + pr_score = 0.1 # score to evaluate P and R https://github.com/ultralytics/yolov3/issues/898 + s = [unique_classes.shape[0], tp.shape[1]] # number class, number iou thresholds (i.e. 10 for mAP0.5...0.95) + ap, p, r = np.zeros(s), np.zeros(s), np.zeros(s) + for ci, c in enumerate(unique_classes): + i = pred_cls == c + n_l = (target_cls == c).sum() # number of labels + n_p = i.sum() # number of predictions + + if n_p == 0 or n_l == 0: + continue + else: + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum(0) + tpc = tp[i].cumsum(0) + + # Recall + recall = tpc / (n_l + 1e-16) # recall curve + r[ci] = np.interp(-pr_score, -conf[i], recall[:, 0]) # r at pr_score, negative x, xp because xp decreases + + # Precision + precision = tpc / (tpc + fpc) # precision curve + p[ci] = np.interp(-pr_score, -conf[i], precision[:, 0]) # p at pr_score + + # AP from recall-precision curve + for j in range(tp.shape[1]): + ap[ci, j], mpre, mrec = compute_ap(recall[:, j], precision[:, j]) + if plot and (j == 0): + py.append(np.interp(px, mrec, mpre)) # precision at mAP@0.5 + + # Compute F1 score (harmonic mean of precision and recall) + f1 = 2 * p * r / (p + r + 1e-16) + + if plot: + plot_pr_curve(px, py, ap, save_dir, names) + + return p, r, ap, f1, unique_classes.astype('int32') + + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves + # Arguments + recall: The recall curve (list) + precision: The precision curve (list) + # Returns + Average precision, precision curve, recall curve + """ + + # 
Append sentinel values to beginning and end + mrec = np.concatenate(([0.], recall, [recall[-1] + 0.01])) + mpre = np.concatenate(([1.], precision, [0.])) + + # Compute the precision envelope + mpre = np.flip(np.maximum.accumulate(np.flip(mpre))) + + # Integrate area under curve + method = 'interp' # methods: 'continuous', 'interp' + if method == 'interp': + x = np.linspace(0, 1, 101) # 101-point interp (COCO) + ap = np.trapz(np.interp(x, mrec, mpre), x) # integrate + else: # 'continuous' + i = np.where(mrec[1:] != mrec[:-1])[0] # points where x axis (recall) changes + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) # area under curve + + return ap, mpre, mrec + + +class ConfusionMatrix: + # Updated version of https://github.com/kaanakan/object_detection_confusion_matrix + def __init__(self, nc, conf=0.25, iou_thres=0.45): + self.matrix = np.zeros((nc + 1, nc + 1)) + self.nc = nc # number of classes + self.conf = conf + self.iou_thres = iou_thres + + def process_batch(self, detections, labels): + """ + Return intersection-over-union (Jaccard index) of boxes. + Both sets of boxes are expected to be in (x1, y1, x2, y2) format. 
+ Arguments: + detections (Array[N, 6]), x1, y1, x2, y2, conf, class + labels (Array[M, 5]), class, x1, y1, x2, y2 + Returns: + None, updates confusion matrix accordingly + """ + detections = detections[detections[:, 4] > self.conf] + gt_classes = labels[:, 0].int() + detection_classes = detections[:, 5].int() + iou = general.box_iou(labels[:, 1:], detections[:, :4]) + + x = torch.where(iou > self.iou_thres) + if x[0].shape[0]: + matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy() + if x[0].shape[0] > 1: + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 1], return_index=True)[1]] + matches = matches[matches[:, 2].argsort()[::-1]] + matches = matches[np.unique(matches[:, 0], return_index=True)[1]] + else: + matches = np.zeros((0, 3)) + + n = matches.shape[0] > 0 + m0, m1, _ = matches.transpose().astype(np.int16) + for i, gc in enumerate(gt_classes): + j = m0 == i + if n and sum(j) == 1: + self.matrix[gc, detection_classes[m1[j]]] += 1 # correct + else: + self.matrix[gc, self.nc] += 1 # background FP + + if n: + for i, dc in enumerate(detection_classes): + if not any(m1 == i): + self.matrix[self.nc, dc] += 1 # background FN + + def matrix(self): + return self.matrix + + def plot(self, save_dir='', names=()): + try: + import seaborn as sn + + array = self.matrix / (self.matrix.sum(0).reshape(1, self.nc + 1) + 1E-6) # normalize + array[array < 0.005] = np.nan # don't annotate (would appear as 0.00) + + fig = plt.figure(figsize=(12, 9), tight_layout=True) + sn.set(font_scale=1.0 if self.nc < 50 else 0.8) # for label size + labels = (0 < len(names) < 99) and len(names) == self.nc # apply names to ticklabels + sn.heatmap(array, annot=self.nc < 30, annot_kws={"size": 8}, cmap='Blues', fmt='.2f', square=True, + xticklabels=names + ['background FN'] if labels else "auto", + yticklabels=names + ['background FP'] if labels else "auto").set_facecolor((1, 1, 1)) + fig.axes[0].set_xlabel('True') + 
fig.axes[0].set_ylabel('Predicted') + fig.savefig(Path(save_dir) / 'confusion_matrix.png', dpi=250) + except Exception as e: + pass + + def print(self): + for i in range(self.nc + 1): + print(' '.join(map(str, self.matrix[i]))) + + +# Plots ---------------------------------------------------------------------------------------------------------------- + +def plot_pr_curve(px, py, ap, save_dir='.', names=()): + fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True) + py = np.stack(py, axis=1) + + if 0 < len(names) < 21: # show mAP in legend if < 10 classes + for i, y in enumerate(py.T): + ax.plot(px, y, linewidth=1, label=f'{names[i]} %.3f' % ap[i, 0]) # plot(recall, precision) + else: + ax.plot(px, py, linewidth=1, color='grey') # plot(recall, precision) + + ax.plot(px, py.mean(1), linewidth=3, color='blue', label='all classes %.3f mAP@0.5' % ap[:, 0].mean()) + ax.set_xlabel('Recall') + ax.set_ylabel('Precision') + ax.set_xlim(0, 1) + ax.set_ylim(0, 1) + plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left") + fig.savefig(Path(save_dir) / 'precision_recall_curve.png', dpi=250) diff --git a/model/detection/yolov5/weights/download_weights.sh b/model/detection/yolov5/weights/download_weights.sh new file mode 100644 index 0000000..43c8e31 --- /dev/null +++ b/model/detection/yolov5/weights/download_weights.sh @@ -0,0 +1,12 @@ +#!/bin/bash +# Download latest models from https://github.com/ultralytics/yolov5/releases +# Usage: +# $ bash weights/download_weights.sh + +python - < 1 # JSD only valid with aug splits set + # self.train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits, smoothing=args.smoothing).to(device) + # elif mixup_active: + # # smoothing is handled with mixup target transform + # self.train_loss_fn = SoftTargetCrossEntropy().to(device) + if args.smoothing: + self.train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).to(device) + else: + self.train_loss_fn = nn.CrossEntropyLoss().to(device) + self.validate_loss_fn = 
nn.CrossEntropyLoss().to(device) + + + def get_model_params(self): + return self.model.cpu().state_dict() + + def set_model_params(self, model_parameters): + self.model.load_state_dict(model_parameters) + + def train(self, train_data, device, args): + model = self.model + + model.to(device) + model.train() + + epoch_loss = [] + for epoch in range(args.epochs): + batch_loss = [] + for batch_idx, (x, labels) in enumerate(train_data): + # logging.info(images.shape) + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + batch_loss.append(loss.item()) + if len(batch_loss) > 0: + epoch_loss.append(sum(batch_loss) / len(batch_loss)) + logging.info('(Trainer_ID {}. Local Training Epoch: {} \tLoss: {:.6f}'.format( + self.id, epoch, sum(epoch_loss) / len(epoch_loss))) + self.lr_scheduler.step(epoch=epoch + 1, metric=None) + + + def train_one_epoch(self, train_data, device, args, epoch, tracker=None, metrics=None): + model = self.model + + model.to(device) + model.train() + batch_loss = [] + for batch_idx, (x, labels) in enumerate(train_data): + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + # logging.debug("labels: {}".format(labels)) + # logging.debug("pred: {}".format(log_probs)) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + batch_loss.append(loss.item()) + if (metrics is not None) and (tracker is not None): + metric_stat = metrics.evaluate(loss, log_probs, labels) + tracker.update_metrics(metric_stat, n_samples=labels.size(0)) + if len(batch_loss) > 0: + logging.info('(Trainer_ID {}. Local Training Epoch: {}, Iter: {} \tLoss: {:.6f} ACC1:{}'.format( + self.id, epoch, batch_idx, sum(batch_loss) / len(batch_loss), metric_stat['Acc1'])) + else: + if len(batch_loss) > 0: + logging.info('(Trainer_ID {}. 
Local Training Epoch: {}, Iter: {} \tLoss: {:.6f}'.format( + self.id, epoch, batch_idx, sum(batch_loss) / len(batch_loss))) + self.lr_scheduler.step(epoch=epoch + 1, metric=None) + + if (metrics is not None) and (tracker is not None): + return None + else: + return sum(batch_loss) / len(batch_loss) + + + + def train_one_step(self, train_batch_data, device, args, tracker=None, metrics=None): + model = self.model + + model.to(device) + model.train() + x, labels = train_batch_data + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + if (tracker is not None) and (metrics is not None): + metric_stat = metrics.evaluate(loss, log_probs, labels) + tracker.update_metrics(metric_stat, n_samples=labels.size(0)) + + return loss, log_probs, labels + + + + def test(self, test_data, device, args, tracker=None, metrics=None): + model = self.model + + model.eval() + model.to(device) + + + with torch.no_grad(): + for batch_idx, (x, target) in enumerate(test_data): + x = x.to(device) + target = target.to(device) + pred = model(x) + # logging.debug("labels: {}".format(target)) + # logging.debug("pred: {}".format(pred)) + loss = self.validate_loss_fn(pred, target) + if (metrics is not None) and (tracker is not None): + metric_stat = metrics.evaluate(loss, pred, target) + tracker.update_metrics(metric_stat, n_samples=target.size(0)) + logging.info('(Trainer_ID {}. 
Local Testing Iter: {} \tLoss: {:.6f} ACC1:{}'.format( + self.id, batch_idx, loss.item(), metric_stat['Acc1'])) + else: + raise NotImplementedError + + if (metrics is not None) and (tracker is not None): + return None + else: + raise NotImplementedError + + + + + diff --git a/training/fedavg_classification_trainer.py b/training/fedavg_classification_trainer.py new file mode 100644 index 0000000..ba99c0c --- /dev/null +++ b/training/fedavg_classification_trainer.py @@ -0,0 +1,126 @@ +import logging + +import torch +from torch import nn + +from timm.loss import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy, JsdCrossEntropy +from timm.optim import create_optimizer +from timm.scheduler import create_scheduler + + +from FedML.fedml_core.trainer.model_trainer import ModelTrainer + + +class ClassificationTrainer(ModelTrainer): + def __init__(self, model, device, args): + super().__init__(model) + # self.model = model + self.args = args + + if args.opt in ['rmsproptf']: + self.optimizer = create_optimizer(args, model) + elif args.opt == 'sgd': + self.optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, + weight_decay=args.wd, momentum=args.momentum) + elif args.opt == 'adam': + self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), + lr=args.lr, + weight_decay=args.wd, amsgrad=True) + else: + raise NotImplementedError + # TODO + # In fedavg, decay according to the round + args.decay_epochs = args.decay_rounds + if args.sched == 'step': + self.lr_scheduler, self.num_epochs = create_scheduler(args, self.optimizer) + elif args.sched == 'StepLR': + self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, + args.decay_epochs, args.decay_rate) + else: + raise NotImplementedError + + self.lr_scheduler.step(0) + + if args.smoothing: + self.train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).to(device) + else: + self.train_loss_fn = nn.CrossEntropyLoss().to(device) + self.validate_loss_fn = 
nn.CrossEntropyLoss().to(device) + + + def get_model_params(self): + return self.model.cpu().state_dict() + + def set_model_params(self, model_parameters): + self.model.load_state_dict(model_parameters) + + def train(self, train_data, device, args): + model = self.model + + model.to(device) + model.train() + + epoch_loss = [] + for epoch in range(args.epochs): + batch_loss = [] + for batch_idx, (x, labels) in enumerate(train_data): + # logging.info(images.shape) + x, labels = x.to(device), labels.to(device) + self.optimizer.zero_grad() + log_probs = model(x) + loss = self.train_loss_fn(log_probs, labels) + loss.backward() + self.optimizer.step() + batch_loss.append(loss.item()) + logging.info('Local Training Epoch: {} iter: {} \t Loss: {:.6f}'.format( + epoch, batch_idx, loss.item())) + if len(batch_loss) > 0: + epoch_loss.append(sum(batch_loss) / len(batch_loss)) + logging.info('(Trainer_ID {}. Local Training Epoch: {} \tLoss: {:.6f}'.format( + self.id, epoch, sum(epoch_loss) / len(epoch_loss))) + # self.lr_scheduler.step(epoch=epoch + 1, metric=None) + self.lr_scheduler.step(epoch=args.round_idx) + + + def test(self, test_data, device, args): + model = self.model + + model.eval() + model.to(device) + + metrics = { + 'test_correct': 0, + 'test_loss': 0, + 'test_precision': 0, + 'test_recall': 0, + 'test_total': 0 + } + + # criterion = nn.CrossEntropyLoss().to(device) + with torch.no_grad(): + for batch_idx, (x, target) in enumerate(test_data): + x = x.to(device) + target = target.to(device) + pred = model(x) + loss = self.validate_loss_fn(pred, target) + if args.dataset == "stackoverflow_lr": + predicted = (pred > .5).int() + correct = predicted.eq(target).sum(axis=-1).eq(target.size(1)).sum() + true_positive = ((target * predicted) > .1).int().sum(axis=-1) + precision = true_positive / (predicted.sum(axis=-1) + 1e-13) + recall = true_positive / (target.sum(axis=-1) + 1e-13) + metrics['test_precision'] += precision.sum().item() + metrics['test_recall'] += 
recall.sum().item() + else: + _, predicted = torch.max(pred, -1) + correct = predicted.eq(target).sum() + + metrics['test_correct'] += correct.item() + metrics['test_loss'] += loss.item() * target.size(0) + metrics['test_total'] += target.size(0) + logging.info('Local Testing iter: {} \t Loss: {:.6f} Acc: {:.6f}'.format( + batch_idx, loss.item(), metrics['test_correct']/metrics['test_total'])) + return metrics + + def test_on_the_server(self, train_data_local_dict, test_data_local_dict, device, args=None) -> bool: + pass diff --git a/utils/context.py b/utils/context.py new file mode 100644 index 0000000..76cea18 --- /dev/null +++ b/utils/context.py @@ -0,0 +1,35 @@ +from contextlib import contextmanager +import threading + +import traceback +from mpi4py import MPI + + + +@contextmanager +def raise_MPI_error(): + import logging + logging.debug("Debugging, Enter the MPI catch error") + try: + yield + except Exception as e: + logging.info(e) + logging.info('traceback.format_exc():\n%s' % traceback.format_exc()) + MPI.COMM_WORLD.Abort() + +@contextmanager +def raise_error_without_process(): + import logging + logging.debug("Debugging, Enter the MPI catch error") + try: + yield + except Exception as e: + logging.info(e) + logging.info('traceback.format_exc():\n%s' % traceback.format_exc()) + +@contextmanager +def get_lock(lock: threading.Lock()): + lock.acquire() + yield + if lock.locked(): + lock.release() diff --git a/utils/logger.py b/utils/logger.py new file mode 100644 index 0000000..f2c53d0 --- /dev/null +++ b/utils/logger.py @@ -0,0 +1,196 @@ +import os +import json +import time +import platform +import logging + +def logging_config(args, process_id): + # customize the log format + while logging.getLogger().handlers: + logging.getLogger().handlers.clear() + console = logging.StreamHandler() + if args.level == 'INFO': + console.setLevel(logging.INFO) + elif args.level == 'DEBUG': + console.setLevel(logging.DEBUG) + else: + raise NotImplementedError + formatter = 
logging.Formatter(str(process_id) + + ' - %(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') + console.setFormatter(formatter) + # Create an instance + logging.getLogger().addHandler(console) + # logging.getLogger().info("test") + logging.basicConfig() + logger = logging.getLogger() + if args.level == 'INFO': + logger.setLevel(logging.INFO) + elif args.level == 'DEBUG': + logger.setLevel(logging.DEBUG) + else: + raise NotImplementedError + logging.info(args) + + + +class Logger(object): + + INFO = 0 + DEBUG = 1 + WARNING = 2 + ERROR = 3 + CRITICAL = 4 + + @classmethod + def config_logger(cls, file_folder='.', level="info", + save_log=False, display_source=False): + """ + :param filename: ending with .json + :param auto_save: save the JSON file after every addition + """ + cls.file_folder = file_folder + cls.file_json = os.path.join(file_folder, "log-1.json") + # cls.file_log can be changed by add_log_file() + cls.file_log = os.path.join(file_folder, "log.log") + cls.values = [] + cls.save_log = save_log + logger = logging.getLogger() + if display_source: + cls.formatter = logging.Formatter('%(asctime)s [%(filename)s:%(lineno)d] %(levelname)s %(message)s') + else: + cls.formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') + cls.level = level + if level == "info": + logger.setLevel(logging.INFO) + elif level == "debug": + logger.setLevel(logging.DEBUG) + elif level == "warning": + logger.setLevel(logging.WARNING) + elif level == "error": + logger.setLevel(logging.ERROR) + elif level == "critical": + logger.setLevel(logging.CRITICAL) + + strhdlr = logging.StreamHandler() + strhdlr.setFormatter(cls.formatter) + logger.addHandler(strhdlr) + if save_log: + cls.add_log_file(cls.file_log) + cls.logger = logger + + + @classmethod + def add_log_file(cls, logfile): + assert cls.save_log is True + hdlr = logging.FileHandler(logfile) + hdlr.setFormatter(cls.formatter) + cls.logger.addHandler(hdlr) + + + @classmethod + def display_metric(cls, 
name, values, tags): + cls.info( + value="{name} ({tags}): {values} ".format( + name=name, values=values) + ) + + + @classmethod + def cache_metric_in_memory(cls, name, values, tags): + """ + Store a scalar metric. Example: + name="runtime", + values={ + "time": current_time, + "rank": rank, + "epoch": epoch, + "best_perf": best_perf, + }, + tags={"split": "test", "type": "local_model_avg"}, + """ + cls.values.append({"measurement": name, **tags, **values}) + + + @classmethod + def log_timer(cls, name, values, tags): + cls.info( + value="{name} ({tags}): {values} ".format( + name=name, values=values) + ) + + + @classmethod + def info(cls, value): + cls.logger.info(value) + + @classmethod + def debug(cls, value): + cls.logger.debug(value) + + @classmethod + def warning(cls, value): + cls.logger.warning(value) + + @classmethod + def error(cls, value): + cls.logger.error(value) + + @classmethod + def critical(cls, value): + cls.logger.critical(value) + + + @classmethod + def save_json(cls): + """Save the internal memory to a file.""" + with open(cls.file_json, "w") as fp: + json.dump(cls.values, fp, indent=" ") + + if len(cls.values) > 1e3: + # reset 'values' and redirect the json file to other name. + cls.values = [] + cls.redirect_new_json() + + + @classmethod + def redirect_new_json(cls): + """get the number of existing json files under the current folder.""" + existing_json_files = [ + file for file in os.listdir(cls.file_folder) if "json" in file + ] + cls.file_json = os.path.join( + cls.file_folder, "log-{}.json".format(len(existing_json_files) + 1) + ) + + +# Usage example +def display_training_stat(conf, tracker, epoch, n_bits_to_transmit): + current_time = time.strftime("%Y-%m-%d %H:%M:%S") + + # display the runtime training information. 
+ Logger.display_metric( + name="runtime", + values={ + "time": current_time, + "epoch": epoch, + "n_bits_to_transmit": n_bits_to_transmit / 8 / (2 ** 20), + **tracker(), + }, + tags={"split": "train"} + ) + + +# Usage example +def display_test_stat(conf, tracker, epoch, label="local"): + current_time = time.strftime("%Y-%m-%d %H:%M:%S") + + # display the runtime training information. + Logger.display_metric( + name="runtime", + values={ + "time": current_time, + "epoch": epoch, + **tracker(), + }, + tags={"split": "test", "type": label} + ) diff --git a/utils/metrics.py b/utils/metrics.py new file mode 100644 index 0000000..9b4ccbe --- /dev/null +++ b/utils/metrics.py @@ -0,0 +1,67 @@ +import math + + +class Metrics(object): + + def __init__(self, topks=[1], task="classification"): + self.task = task + self.topks = topks + self.metric_names = self.get_metric_names(topks, task) + self.metrics_fn = self._get_metric_measure(topks, task) + + def evaluate(self, loss, output, target): + return self.metrics_fn(loss, output, target) + + @classmethod + def get_metric_names(cls, topks, task): + if task == "classification": + metric_names = ["Acc{}".format(topk) for topk in topks] + metric_names += ["Loss"] + elif task == "stackoverflow_lr": + metric_names = ["Acc", "Loss", "Precision", "Recall"] + else: + raise NotImplementedError + return metric_names + + def _get_metric_measure(self, topks, task): + if task == "classification": + return self._classification_metric + elif task == "stackoverflow_lr": + return self._stackoverflow_lr_metric + else: + raise NotImplementedError + + assert self.metric_names is not None + + def _classification_metric(self, loss, output, target): + """Computes the precision@k for the specified values of k""" + metric_stat = {} + metric_stat["Loss"] = loss.item() + + maxk = max(self.topks) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + for 
topk in self.topks: + correct_k = correct[:topk].view(-1).float().sum(0, keepdim=True) + # res.append(correct_k.mul_(100.0 / batch_size).item()) + metric_stat["Acc{}".format(topk)] = correct_k.mul_(100.0 / batch_size).item() + + return metric_stat + + def _stackoverflow_lr_metric(self, loss, output, target): + metric_stat = {} + metric_stat["Loss"] = loss.item() + predicted = (output > .5).int() + correct = predicted.eq(target).sum(axis=-1).eq(target.size(1)).sum() + true_positive = ((target * predicted) > .1).int().sum(axis=-1) + metric_stat["Precision"] = true_positive / (predicted.sum(axis=-1) + 1e-13) + metric_stat["Recall"] = true_positive / (target.sum(axis=-1) + 1e-13) + metric_stat["Acc"] = correct.mul_(100.0 / target.size(0)).item() + metric_stat["Loss"] = loss.item() + return metric_stat + + diff --git a/utils/tracker.py b/utils/tracker.py new file mode 100644 index 0000000..dcce876 --- /dev/null +++ b/utils/tracker.py @@ -0,0 +1,133 @@ +from copy import deepcopy + + +class MaxMeter(object): + """ + Keeps track of the max of all the values that are 'add'ed + """ + + def __init__(self): + self.max = None + + def update(self, value): + """ + Add a value to the accumulator. + :return: `true` if the provided value became the new max + """ + if self.max is None or value > self.max: + self.max = deepcopy(value) + return True + else: + return False + + def value(self): + """Access the current running average""" + return self.max + + +class MinMeter(object): + """ + Keeps track of the max of all the values that are 'add'ed + """ + + def __init__(self): + self.min = None + + def update(self, value): + """ + Add a value to the accumulator. 
+ :return: `true` if the provided value became the new max + """ + if self.min is None or value < self.min: + self.min = deepcopy(value) + return True + else: + return False + + def value(self): + """Access the current running average""" + return self.min + + +class AverageMeter(object): + """Computes and stores the average and current value""" + + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.max = -float("inf") + self.min = float("inf") + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + self.max = val if val > self.max else self.max + self.min = val if val < self.min else self.min + + +class RuntimeTracker(object): + """Tracking the runtime stat for local training.""" + + # def __init__(self, metrics_to_track=["top1"], on_cuda=True): + def __init__(self, things_to_track=["loss"], on_cuda=True): + self.things_to_track = things_to_track + self.on_cuda = on_cuda + self.n_samples = 0 + self.stat = None + self.reset() + + + def reset(self): + self.stat = dict((name, AverageMeter()) for name in self.things_to_track) + self.n_samples = 0 + + # def evaluate_global_metric(self, metric): + # return global_average( + # self.stat[metric].sum, self.stat[metric].count, on_cuda=self.on_cuda + # ).item() + + # def evaluate_global_metrics(self): + # return [self.evaluate_global_metric(metric) for metric in self.metrics_to_track] + + def get_metrics_performance(self): + return [self.stat[thing].avg for thing in self.things_to_track] + + def update_metrics(self, metric_stat, n_samples): + self.n_samples += n_samples + for thing in self.things_to_track: + self.stat[thing].update(metric_stat[thing], n_samples) + + def __call__(self): + return dict((name, val.avg) for name, val in self.stat.items()) + + +class BestPerf(object): + def __init__(self, best_perf=None, larger_is_better=True): + self.best_perf = best_perf + self.cur_perf = 
None + self.best_perf_locs = [] + self.larger_is_better = larger_is_better + + # define meter + self._define_meter() + + def _define_meter(self): + self.meter = MaxMeter() if self.larger_is_better else MinMeter() + + def update(self, perf, perf_location): + self.is_best = self.meter.update(perf) + self.cur_perf = perf + + if self.is_best: + self.best_perf = perf + self.best_perf_locs += [perf_location] + + def get_best_perf_loc(self): + return self.best_perf_locs[-1] if len(self.best_perf_locs) != 0 else None diff --git a/utils/wandb_util.py b/utils/wandb_util.py new file mode 100644 index 0000000..984aae6 --- /dev/null +++ b/utils/wandb_util.py @@ -0,0 +1,18 @@ +import wandb + +def wandb_log(prefix, sp_values, com_values): + """ + prefix + tags.values is the name of sp_values; + values should include information like: + {"Acc": 0.9, "Loss":} + com_values should include information like: + {"epoch": epoch, } + """ + new_values = {} + for k, _ in sp_values.items(): + new_values[prefix+"/" + k] = sp_values[k] + new_values.update(com_values) + wandb.log(new_values) + + +