diff --git a/recipes/objection_detection/cfg/data/coco.yaml b/recipes/objection_detection/cfg/data/coco.yaml
new file mode 100644
index 0000000..5578693
--- /dev/null
+++ b/recipes/objection_detection/cfg/data/coco.yaml
@@ -0,0 +1,151 @@
+########
+# Data configuration file for COCO training.
+# Based on the ultralytics data conf.
+#
+# Adapted by:
+# - Matteo Beltrami, 2023
+# - Francesco Paissan, 2023
+########
+task: detect  # (str) YOLO task, i.e. detect, segment, classify, pose
+mode: train  # (str) YOLO mode, i.e. train, val, predict, export, track, benchmark
+
+# Train settings -------------------------------------------------------------------------------------------------------
+imgsz: 640  # (int | list) input image size as int for train and val modes, or list[w,h] for predict and export modes
+rect: False  # (bool) rectangular training if mode='train' or rectangular validation if mode='val'
+cache: False  # (bool) True/ram, disk or False. Use cache for data loading
+single_cls: False  # (bool) train multi-class data as single-class
+fraction: 1.0  # (float) dataset fraction to train on (default is 1.0, all images in train set)
+
+# Segmentation
+overlap_mask: True  # (bool) masks should overlap during training (segment train only)
+mask_ratio: 4  # (int) mask downsample ratio (segment train only)
+
+# Prediction settings --------------------------------------------------------------------------------------------------
+classes:  # (int | list[int], optional) filter results by class, i.e. classes=0, or classes=[0,2,3]
+
+# Hyperparameters ------------------------------------------------------------------------------------------------------
+box: 7.5  # (float) box loss gain
+cls: 0.5  # (float) cls loss gain (scale with pixels)
+dfl: 1.5  # (float) dfl loss gain
+
+hsv_h: 0.015  # (float) image HSV-Hue augmentation (fraction)
+hsv_s: 0.7  # (float) image HSV-Saturation augmentation (fraction)
+hsv_v: 0.4  # (float) image HSV-Value augmentation (fraction)
+degrees: 0.0  # (float) image rotation (+/- deg)
+translate: 0.1  # (float) image translation (+/- fraction)
+scale: 0.5  # (float) image scale (+/- gain)
+shear: 0.0  # (float) image shear (+/- deg)
+perspective: 0.0  # (float) image perspective (+/- fraction), range 0-0.001
+flipud: 0.0  # (float) image flip up-down (probability)
+fliplr: 0.5  # (float) image flip left-right (probability)
+mosaic: 1.0  # (float) image mosaic (probability)
+mixup: 0.0  # (float) image mixup (probability)
+copy_paste: 0.0  # (float) segment copy-paste (probability)
+
+
+# Dataset location
+path: /mnt/data/coco  # dataset root dir
+train: train2017.txt  # train images (relative to 'path') 118287 images
+val: val2017.txt  # val images (relative to 'path') 5000 images
+test: test-dev2017.txt  # 20288 of 40670 images, submit to https://competitions.codalab.org/competitions/20794
+
+# Classes
+names:
+  0: person
+  1: bicycle
+  2: car
+  3: motorcycle
+  4: airplane
+  5: bus
+  6: train
+  7: truck
+  8: boat
+  9: traffic light
+  10: fire hydrant
+  11: stop sign
+  12: parking meter
+  13: bench
+  14: bird
+  15: cat
+  16: dog
+  17: horse
+  18: sheep
+  19: cow
+  20: elephant
+  21: bear
+  22: zebra
+  23: giraffe
+  24: backpack
+  25: umbrella
+  26: handbag
+  27: tie
+  28: suitcase
+  29: frisbee
+  30: skis
+  31: snowboard
+  32: sports ball
+  33: kite
+  34: baseball bat
+  35: baseball glove
+  36: skateboard
+  37: surfboard
+  38: tennis racket
+  39: bottle
+  40: wine glass
+  41: cup
+  42: fork
+  43: knife
+  44: spoon
+  45: bowl
+  46: banana
+  47: apple
+  48: sandwich
+  49: orange
+  50: broccoli
+  51: carrot
+  52: hot dog
+  53: pizza
+  54: donut
+  55: cake
+  56: chair
+  57: couch
+  58: potted plant
+  59: bed
+  60: dining table
+  61: toilet
+  62: tv
+  63: laptop
+  64: mouse
+  65: remote
+  66: keyboard
+  67: cell phone
+  68: microwave
+  69: oven
+  70: toaster
+  71: sink
+  72: refrigerator
+  73: book
+  74: clock
+  75: vase
+  76: scissors
+  77: teddy bear
+  78: hair drier
+  79: toothbrush
+
+
+# Download script/URL (optional)
+download: |
+  from ultralytics.utils.downloads import download
+  from pathlib import Path
+
+  # Download labels
+  segments = True  # segment or box labels
+  dir = Path(data_cfg['path'])  # dataset root dir
+  url = 'https://github.com/ultralytics/yolov5/releases/download/v1.0/'
+  urls = [url + ('coco2017labels-segments.zip' if segments else 'coco2017labels.zip')]  # labels
+  download(urls, dir=dir.parent)
+  # Download data
+  urls = ['http://images.cocodataset.org/zips/train2017.zip',  # 19G, 118k images
+          'http://images.cocodataset.org/zips/val2017.zip',  # 1G, 5k images
+          'http://images.cocodataset.org/zips/test2017.zip']  # 7G, 41k images (optional)
+  download(urls, dir=dir / 'images', threads=3)
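
The download key above holds plain Python, not a URL: prepare_data.py (introduced later in this diff) executes it with exec() when the directory in path does not exist, with data_cfg already in scope. A minimal sketch of that mechanism, assuming the YAML is parsed with PyYAML (the recipe itself goes through micromind.utils.yolo.load_config):

    import os
    import yaml  # PyYAML

    with open("cfg/data/coco.yaml") as f:
        data_cfg = yaml.safe_load(f)

    # Same guard used in prepare_data.create_loaders: only run the snippet when
    # the dataset root is missing; the snippet itself reads data_cfg['path'].
    if "download" in data_cfg and not os.path.exists(data_cfg["path"]):
        exec(data_cfg["download"])
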
diff --git a/recipes/objection_detection/cfg/data/coco8.yaml b/recipes/objection_detection/cfg/data/coco8.yaml
new file mode 100644
index 0000000..6492788
--- /dev/null
+++ b/recipes/objection_detection/cfg/data/coco8.yaml
@@ -0,0 +1,144 @@
+########
+# Data configuration file for COCO8 training.
+# Based on the ultralytics data conf.
+#
+# Adapted by:
+# - Matteo Beltrami, 2023
+# - Francesco Paissan, 2023
+########
+task: detect  # (str) YOLO task, i.e. detect, segment, classify, pose
+mode: train  # (str) YOLO mode, i.e. train, val, predict, export, track, benchmark
+
+# Train settings -------------------------------------------------------------------------------------------------------
+imgsz: 640  # (int | list) input image size as int for train and val modes, or list[w,h] for predict and export modes
+rect: False  # (bool) rectangular training if mode='train' or rectangular validation if mode='val'
+cache: False  # (bool) True/ram, disk or False. Use cache for data loading
+single_cls: False  # (bool) train multi-class data as single-class
+fraction: 1.0  # (float) dataset fraction to train on (default is 1.0, all images in train set)
+
+# Segmentation
+overlap_mask: True  # (bool) masks should overlap during training (segment train only)
+mask_ratio: 4  # (int) mask downsample ratio (segment train only)
+
+# Prediction settings --------------------------------------------------------------------------------------------------
+classes:  # (int | list[int], optional) filter results by class, i.e. classes=0, or classes=[0,2,3]
+
+# Hyperparameters ------------------------------------------------------------------------------------------------------
+box: 7.5  # (float) box loss gain
+cls: 0.5  # (float) cls loss gain (scale with pixels)
+dfl: 1.5  # (float) dfl loss gain
+
+hsv_h: 0.015  # (float) image HSV-Hue augmentation (fraction)
+hsv_s: 0.7  # (float) image HSV-Saturation augmentation (fraction)
+hsv_v: 0.4  # (float) image HSV-Value augmentation (fraction)
+degrees: 0.0  # (float) image rotation (+/- deg)
+translate: 0.1  # (float) image translation (+/- fraction)
+scale: 0.5  # (float) image scale (+/- gain)
+shear: 0.0  # (float) image shear (+/- deg)
+perspective: 0.0  # (float) image perspective (+/- fraction), range 0-0.001
+flipud: 0.0  # (float) image flip up-down (probability)
+fliplr: 0.5  # (float) image flip left-right (probability)
+mosaic: 1.0  # (float) image mosaic (probability)
+mixup: 0.0  # (float) image mixup (probability)
+copy_paste: 0.0  # (float) segment copy-paste (probability)
+
+
+# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
+path: /mnt/data/coco8  # dataset root dir
+train: images/train  # train images (relative to 'path') 4 images
+val: images/val  # val images (relative to 'path') 4 images
+test:  # test images (optional)
+
+# Classes
+names:
+  0: person
+  1: bicycle
+  2: car
+  3: motorcycle
+  4: airplane
+  5: bus
+  6: train
+  7: truck
+  8: boat
+  9: traffic light
+  10: fire hydrant
+  11: stop sign
+  12: parking meter
+  13: bench
+  14: bird
+  15: cat
+  16: dog
+  17: horse
+  18: sheep
+  19: cow
+  20: elephant
+  21: bear
+  22: zebra
+  23: giraffe
+  24: backpack
+  25: umbrella
+  26: handbag
+  27: tie
+  28: suitcase
+  29: frisbee
+  30: skis
+  31: snowboard
+  32: sports ball
+  33: kite
+  34: baseball bat
+  35: baseball glove
+  36: skateboard
+  37: surfboard
+  38: tennis racket
+  39: bottle
+  40: wine glass
+  41: cup
+  42: fork
+  43: knife
+  44: spoon
+  45: bowl
+  46: banana
+  47: apple
+  48: sandwich
+  49: orange
+  50: broccoli
+  51: carrot
+  52: hot dog
+  53: pizza
+  54: donut
+  55: cake
+  56: chair
+  57: couch
+  58: potted plant
+  59: bed
+  60: dining table
+  61: toilet
+  62: tv
+  63: laptop
+  64: mouse
+  65: remote
+  66: keyboard
+  67: cell phone
+  68: microwave
+  69: oven
+  70: toaster
+  71: sink
+  72: refrigerator
+  73: book
+  74: clock
+  75: vase
+  76: scissors
+  77: teddy bear
+  78: hair drier
+  79: toothbrush
+
+# Download script/URL (optional)
+download: |
+  from pathlib import Path
+  import zipfile
+  import os
+  data_cfg['path'] = Path(data_cfg['path'])
+  os.makedirs(data_cfg["path"], exist_ok=True)
+  os.system(f"wget https://ultralytics.com/assets/coco8.zip -O {os.path.join(data_cfg['path'], 'coco8.zip')}")
+  with zipfile.ZipFile(os.path.join(data_cfg['path'], 'coco8.zip'), 'r') as zip_ref:
+      zip_ref.extractall(data_cfg['path'].parent)
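
COCO8 is the 8-image smoke-test split (4 train, 4 val) that ultralytics distributes for quick pipeline checks. A small sanity check one might run after the download snippet above has populated /mnt/data/coco8; the images/ and labels/ layout assumed here is the standard ultralytics one:

    from pathlib import Path

    root = Path("/mnt/data/coco8")  # must match 'path' above
    for split in ("train", "val"):
        imgs = sorted((root / "images" / split).glob("*.jpg"))
        lbls = sorted((root / "labels" / split).glob("*.txt"))
        # each split should report 4 images, each with a YOLO-format label file
        print(f"{split}: {len(imgs)} images, {len(lbls)} label files")
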
diff --git a/recipes/objection_detection/cfg/yolo_phinet.py b/recipes/objection_detection/cfg/yolo_phinet.py
new file mode 100644
index 0000000..4d7e5c8
--- /dev/null
+++ b/recipes/objection_detection/cfg/yolo_phinet.py
@@ -0,0 +1,20 @@
+"""
+YOLOPhiNet training configuration.
+
+Authors:
+    - Matteo Beltrami, 2023
+    - Francesco Paissan, 2023
+"""
+# Data configuration
+batch_size = 8
+data_cfg = "cfg/data/coco.yaml"
+
+# Model configuration
+input_shape = (3, 672, 672)
+alpha = 3
+num_layers = 7
+beta = 0.75
+t_zero = 6
+divisor = 8
+downsampling_layers = [5, 7]
+return_layers = [4, 6, 7]
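
Note that train.py (later in this diff) also reads hparams.model_path for the ImageNet-pretrained PhiNet checkpoint, as well as hparams.output_folder, hparams.experiment_name and hparams.debug. micromind's parse_configuration may supply defaults for the last three, but model_path is not defined above. A hedged sketch of the extra entries one would append to this file; the names come from train.py, the values are placeholders only:

    model_path = "phinet_imagenet.ckpt"  # assumed filename of the pretrained backbone weights
    output_folder = "results"            # root folder for experiment checkpoints and logs
    experiment_name = "yolo_phinet"      # run name inside output_folder
    debug = False                        # forwarded to MicroMind.train()
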
diff --git a/recipes/objection_detection/prepare_data.py b/recipes/objection_detection/prepare_data.py
new file mode 100644
index 0000000..bb3b570
--- /dev/null
+++ b/recipes/objection_detection/prepare_data.py
@@ -0,0 +1,88 @@
+"""
+Data preparation script for YOLO training. Parses ultralytics YAML files
+and, if needed, downloads the corresponding data to disk.
+
+Authors:
+    - Matteo Beltrami, 2023
+    - Francesco Paissan, 2023
+"""
+from typing import Dict
+import os
+
+from torch.utils.data import DataLoader, ConcatDataset
+from ultralytics.data import build_yolo_dataset
+
+
+def create_loaders(m_cfg: Dict, data_cfg: Dict, batch_size: int):
+    """Creates DataLoaders for the dataset specified in the configuration file.
+    Refer to ... for how to select the proper configuration.
+
+    Arguments
+    ---------
+    m_cfg : Dict
+        Contains information about the training process (e.g., data augmentation).
+    data_cfg : Dict
+        Contains details about the data configurations (e.g., image size, etc.).
+    batch_size : int
+        Batch size for the training process.
+
+    """
+    if "download" in data_cfg and not os.path.exists(data_cfg["path"]):
+        # download data if it's not there
+        exec(data_cfg["download"])
+
+    mode = "train"
+    if isinstance(data_cfg["train"], list):
+        train_set = []
+        for p in data_cfg["train"]:
+            train_set.append(
+                build_yolo_dataset(
+                    m_cfg,
+                    p,
+                    batch_size,
+                    data_cfg,
+                    mode=mode,
+                    rect=mode == "val",
+                )
+            )
+        train_set = ConcatDataset(train_set)
+    else:
+        train_set = build_yolo_dataset(
+            m_cfg,
+            data_cfg["train"],
+            batch_size,
+            data_cfg,
+            mode=mode,
+            rect=mode == "val",
+        )
+
+    train_loader = DataLoader(
+        train_set,
+        batch_size,
+        shuffle=True,
+        num_workers=16,
+        persistent_workers=True,
+        pin_memory=True,
+        collate_fn=getattr(train_set, "collate_fn", None),
+    )
+
+    mode = "val"
+    val_set = build_yolo_dataset(
+        m_cfg,
+        data_cfg["val"],
+        batch_size,
+        data_cfg,
+        mode=mode,
+        rect=mode == "val",
+    )
+
+    val_loader = DataLoader(
+        val_set,
+        batch_size,
+        shuffle=False,
+        num_workers=16,
+        persistent_workers=True,
+        pin_memory=True,
+        collate_fn=getattr(val_set, "collate_fn", None),
+    )
+
+    return train_loader, val_loader
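
A standalone usage sketch for create_loaders, mirroring the call made in train.py; the printed shape depends on imgsz and batch size and is only indicative:

    from micromind.utils.yolo import load_config
    from prepare_data import create_loaders

    # load_config returns the augmentation/hyperparameter namespace (m_cfg)
    # and the dataset dictionary (data_cfg) parsed from the YAML file.
    m_cfg, data_cfg = load_config("cfg/data/coco8.yaml")
    train_loader, val_loader = create_loaders(m_cfg, data_cfg, batch_size=8)

    batch = next(iter(train_loader))
    print(batch["img"].shape)  # e.g. torch.Size([8, 3, 640, 640])
    print(batch.keys())        # ultralytics collate format: img, cls, bboxes, batch_idx, ...
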
diff --git a/recipes/objection_detection/train.py b/recipes/objection_detection/train.py
new file mode 100644
index 0000000..51cc0c5
--- /dev/null
+++ b/recipes/objection_detection/train.py
@@ -0,0 +1,194 @@
+"""
+YOLO training.
+
+This code allows you to train an object detection model with the YOLOv8 neck and loss.
+
+To run this script, you can start it with:
+    python train.py cfg/yolo_phinet.py
+
+Authors:
+    - Matteo Beltrami, 2023
+    - Francesco Paissan, 2023
+"""
+
+import torch
+from prepare_data import create_loaders
+from torchinfo import summary
+from ultralytics.utils.ops import scale_boxes, xywh2xyxy
+from yolo_loss import Loss
+
+import micromind as mm
+from micromind.networks import PhiNet
+from micromind.networks.yolo import SPPF, DetectionHead, Yolov8Neck
+from micromind.utils import parse_configuration
+from micromind.utils.yolo import (
+    load_config,
+    mean_average_precision,
+    postprocess,
+)
+import sys
+
+
+class YOLO(mm.MicroMind):
+    def __init__(self, m_cfg, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.modules["phinet"] = PhiNet(
+            input_shape=hparams.input_shape,
+            alpha=hparams.alpha,
+            num_layers=hparams.num_layers,
+            beta=hparams.beta,
+            t_zero=hparams.t_zero,
+            include_top=False,
+            compatibility=False,
+            divisor=hparams.divisor,
+            downsampling_layers=hparams.downsampling_layers,
+            return_layers=hparams.return_layers,
+        )
+
+        # load ImageNet checkpoint
+        self.modules["phinet"].load_state_dict(
+            torch.load(hparams.model_path), strict=False
+        )
+
+        sppf_ch, neck_filters, up, head_filters = self.get_parameters()
+
+        self.modules["sppf"] = SPPF(*sppf_ch)
+        self.modules["neck"] = Yolov8Neck(filters=neck_filters, up=up)
+        self.modules["head"] = DetectionHead(filters=head_filters)
+
+        tot_params = 0
+        for m in self.modules.values():
+            temp = summary(m, verbose=0)
+            tot_params += temp.total_params
+
+        print(f"Total parameters of model: {tot_params * 1e-6:.2f} M")
+
+        self.m_cfg = m_cfg
+
+    def get_parameters(self):
+        """
+        Gets the parameters with which to initialize the network detection part
+        (SPPF block, Yolov8Neck, DetectionHead).
+        """
+        in_shape = self.modules["phinet"].input_shape
+        x = torch.randn(1, *in_shape)
+        y = self.modules["phinet"](x)
+
+        c1 = c2 = y[1][2].shape[1]
+        sppf = SPPF(c1, c2)
+        out_sppf = sppf(y[1][2])
+
+        neck_filters = [y[1][0].shape[1], y[1][1].shape[1], out_sppf.shape[1]]
+        up = [2, 2]
+        up[0] = y[1][1].shape[2] / out_sppf.shape[2]
+        up[1] = y[1][0].shape[2] / (up[0] * out_sppf.shape[2])
+        temp = """The layers you selected are not valid. \
+            Please choose only layers between which the spatial resolution \
+            doubles every time. If needed, you can achieve this by \
+            changing the downsampling layers."""
+
+        assert up == [2, 2], " ".join(temp.split())
+
+        neck = Yolov8Neck(filters=neck_filters, up=up)
+        out_neck = neck(y[1][0], y[1][1], out_sppf)
+
+        head_filters = (
+            out_neck[0].shape[1],
+            out_neck[1].shape[1],
+            out_neck[2].shape[1],
+        )
+        # head = DetectionHead(filters=head_filters)
+
+        return (c1, c2), neck_filters, up, head_filters
+
+    def preprocess_batch(self, batch):
+        """Preprocesses a batch of images by scaling and converting to float."""
+        preprocessed_batch = {}
+        preprocessed_batch["img"] = (
+            batch["img"].to(self.device, non_blocking=True).float() / 255
+        )
+        for k in batch:
+            if isinstance(batch[k], torch.Tensor) and k != "img":
+                preprocessed_batch[k] = batch[k].to(self.device)
+
+        return preprocessed_batch
+
+    def forward(self, batch):
+        """Runs the detection pipeline: backbone, SPPF, neck, detection head."""
+        preprocessed_batch = self.preprocess_batch(batch)
+        backbone = self.modules["phinet"](preprocessed_batch["img"].to(self.device))[1]
+        backbone[-1] = self.modules["sppf"](backbone[-1])
+        neck = self.modules["neck"](*backbone)
+        head = self.modules["head"](neck)
+
+        return head
+
+    def compute_loss(self, pred, batch):
+        """Computes the YOLOv8 loss (box, cls, dfl) on a batch of predictions."""
+        self.criterion = Loss(self.m_cfg, self.modules["head"], self.device)
+        preprocessed_batch = self.preprocess_batch(batch)
+
+        lossi_sum, lossi = self.criterion(
+            pred[1],
+            preprocessed_batch,
+        )
+
+        return lossi_sum
+
+    def configure_optimizers(self):
+        """Configures the SGD optimizer and the cosine annealing LR scheduler."""
+        opt = torch.optim.SGD(self.modules.parameters(), lr=1e-2, weight_decay=0.0005)
+        sched = torch.optim.lr_scheduler.CosineAnnealingLR(
+            opt, T_max=14000, eta_min=1e-3
+        )
+        return opt, sched
+
+    @torch.no_grad()
+    def mAP(self, pred, batch):
+        """Computes the mean average precision (mAP) of the predictions on a batch."""
+        preprocessed_batch = self.preprocess_batch(batch)
+        post_predictions = postprocess(
+            preds=pred[0], img=preprocessed_batch, orig_imgs=batch
+        )
+
+        batch_bboxes_xyxy = xywh2xyxy(batch["bboxes"])
+        dim = batch["resized_shape"][0][0]
+        batch_bboxes_xyxy[:, :4] *= dim
+
+        batch_bboxes = []
+        for i in range(len(batch["batch_idx"])):
+            for b in range(len(batch_bboxes_xyxy[batch["batch_idx"] == i, :])):
+                batch_bboxes.append(
+                    scale_boxes(
+                        batch["resized_shape"][i],
+                        batch_bboxes_xyxy[batch["batch_idx"] == i, :][b],
+                        batch["ori_shape"][i],
+                    )
+                )
+        batch_bboxes = torch.stack(batch_bboxes)
+        mmAP = mean_average_precision(post_predictions, batch, batch_bboxes)
+
+        return torch.Tensor([mmAP])
+
+
+if __name__ == "__main__":
+    assert len(sys.argv) > 1, "Please pass the configuration file to the script."
+    hparams = parse_configuration(sys.argv[1])
+
+    m_cfg, data_cfg = load_config(hparams.data_cfg)
+    train_loader, val_loader = create_loaders(m_cfg, data_cfg, hparams.batch_size)
+
+    exp_folder = mm.utils.checkpointer.create_experiment_folder(
+        hparams.output_folder, hparams.experiment_name
+    )
+
+    checkpointer = mm.utils.checkpointer.Checkpointer(exp_folder, key="loss")
+
+    yolo_mind = YOLO(m_cfg, hparams=hparams)
+
+    mAP = mm.Metric("mAP", yolo_mind.mAP, eval_only=True, eval_period=1)
+
+    yolo_mind.train(
+        epochs=200,
+        datasets={"train": train_loader, "val": val_loader},
+        metrics=[mAP],
+        checkpointer=checkpointer,
+        debug=hparams.debug,
+    )
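
The assertion in get_parameters is easiest to check outside the training loop. Below is a shape sanity check mirroring that method, built with the values from cfg/yolo_phinet.py; the exact channel counts depend on micromind's PhiNet implementation and are only printed here, not asserted:

    import torch
    from micromind.networks import PhiNet
    from micromind.networks.yolo import SPPF, Yolov8Neck, DetectionHead

    phinet = PhiNet(
        input_shape=(3, 672, 672), alpha=3, num_layers=7, beta=0.75, t_zero=6,
        include_top=False, compatibility=False, divisor=8,
        downsampling_layers=[5, 7], return_layers=[4, 6, 7],
    )
    feats = phinet(torch.randn(1, 3, 672, 672))[1]  # the three returned feature maps
    print([tuple(f.shape) for f in feats])          # spatial size should halve at each step

    sppf_out = SPPF(feats[2].shape[1], feats[2].shape[1])(feats[2])
    neck_out = Yolov8Neck(
        filters=[feats[0].shape[1], feats[1].shape[1], sppf_out.shape[1]], up=[2, 2]
    )(feats[0], feats[1], sppf_out)
    head = DetectionHead(filters=tuple(o.shape[1] for o in neck_out))
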
diff --git a/recipes/objection_detection/yolo_loss.py b/recipes/objection_detection/yolo_loss.py
new file mode 100644
index 0000000..29650a6
--- /dev/null
+++ b/recipes/objection_detection/yolo_loss.py
@@ -0,0 +1,137 @@
+"""
+Wrapper for the YOLO loss, from the ultralytics implementation.
+
+For a reference on the parameters, please refer to https://shorturl.at/gkrAO
+
+
+Authors:
+    - Matteo Beltrami, 2023
+    - Francesco Paissan, 2023
+"""
+import torch
+import torch.nn as nn
+from ultralytics.utils.loss import BboxLoss, v8DetectionLoss
+from ultralytics.utils.ops import xywh2xyxy
+from ultralytics.utils.tal import TaskAlignedAssigner, dist2bbox, make_anchors
+
+
+class Loss(v8DetectionLoss):
+    def __init__(self, h, m, device):  # model must be de-paralleled
+        self.bce = nn.BCEWithLogitsLoss(reduction="none")
+        self.hyp = h
+        self.stride = m.stride
+        self.nc = m.nc
+        self.no = m.no
+        self.reg_max = m.reg_max
+        self.device = device
+
+        self.use_dfl = m.reg_max > 1
+
+        self.assigner = TaskAlignedAssigner(
+            topk=10, num_classes=self.nc, alpha=0.5, beta=6.0
+        )
+        self.bbox_loss = BboxLoss(m.reg_max - 1, use_dfl=self.use_dfl).to(device)
+        self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)
+
+    def preprocess(self, targets, batch_size, scale_tensor):
+        """
+        Preprocesses the targets by grouping them per image and padding them into
+        a fixed-size tensor that matches the input batch size.
+        """
+        if targets.shape[0] == 0:
+            out = torch.zeros(batch_size, 0, 5, device=self.device)
+        else:
+            i = targets[:, 0]  # image index
+            _, counts = i.unique(return_counts=True)
+            counts = counts.to(dtype=torch.int32)
+            out = torch.zeros(batch_size, counts.max(), 5, device=self.device)
+            for j in range(batch_size):
+                matches = i == j
+                n = matches.sum()
+                if n:
+                    out[j, :n] = targets[matches, 1:]
+            out[..., 1:5] = xywh2xyxy(out[..., 1:5].mul_(scale_tensor))
+        return out
+
+    def bbox_decode(self, anchor_points, pred_dist):
+        """
+        Decode predicted object bounding box coordinates from anchor points and
+        distribution.
+        """
+        if self.use_dfl:
+            b, a, c = pred_dist.shape  # batch, anchors, channels
+            pred_dist = (
+                pred_dist.view(b, a, 4, c // 4)
+                .softmax(3)
+                .matmul(self.proj.type(pred_dist.dtype))
+            )
+        return dist2bbox(pred_dist, anchor_points, xywh=False)
+
+    def __call__(self, preds, batch):
+        """
+        Calculate the sum of the loss for box, cls and dfl multiplied by batch size.
+        """
+        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
+        feats = preds[1] if isinstance(preds, tuple) else preds
+        pred_distri, pred_scores = torch.cat(
+            [xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2
+        ).split((self.reg_max * 4, self.nc), 1)
+
+        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
+        pred_distri = pred_distri.permute(0, 2, 1).contiguous()
+
+        dtype = pred_scores.dtype
+        batch_size = pred_scores.shape[0]
+        imgsz = (
+            torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype)
+            * self.stride[0]
+        )  # image size (h,w)
+        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
+
+        # Targets
+        targets = torch.cat(
+            (batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]),
+            1,
+        )
+        targets = self.preprocess(
+            targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]]
+        )
+        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
+        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)
+
+        # Pboxes
+        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)
+
+        _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
+            pred_scores.detach().sigmoid(),
+            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
+            anchor_points * stride_tensor,
+            gt_labels,
+            gt_bboxes,
+            mask_gt,
+        )
+
+        target_scores_sum = max(target_scores.sum(), 1)
+
+        # Cls loss
+        loss[1] = (
+            self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum
+        )  # BCE
+
+        # Bbox loss
+        if fg_mask.sum():
+            target_bboxes /= stride_tensor
+            loss[0], loss[2] = self.bbox_loss(
+                pred_distri,
+                pred_bboxes,
+                anchor_points,
+                target_bboxes,
+                target_scores,
+                target_scores_sum,
+                fg_mask,
+            )
+
+        loss[0] *= self.hyp.box  # box gain
+        loss[1] *= self.hyp.cls  # cls gain
+        loss[2] *= self.hyp.dfl  # dfl gain
+
+        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
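
A worked example of the target scaling performed in preprocess and __call__ above: normalized xywh ground-truth boxes are multiplied by (w, h, w, h) of the network input and converted to xyxy, the format the TaskAlignedAssigner and BboxLoss expect. The numbers below are arbitrary and only illustrate the arithmetic:

    import torch
    from ultralytics.utils.ops import xywh2xyxy

    imgsz = torch.tensor([640.0, 640.0])               # (h, w), as computed from feats and stride
    scale_tensor = imgsz[[1, 0, 1, 0]]                 # (w, h, w, h)
    boxes_xywh = torch.tensor([[0.5, 0.5, 0.2, 0.3]])  # one normalized ground-truth box
    boxes_xyxy = xywh2xyxy(boxes_xywh * scale_tensor)
    print(boxes_xyxy)                                  # tensor([[256., 224., 384., 416.]])
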