diff --git a/README.md b/README.md
index c156d500..d2b6ee8a 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ After installing the required computing environment (see next section), one need
 
 ## config.yaml
 
-The `config.yaml` file is located in the `conf` directory. It stores the values of all parameters needed by the deep learning algorithms for all phases. It contains the following 5 sections:
+The `config.yaml` file is located in the `conf` directory. It stores the values of all parameters needed by the deep learning algorithms for all phases. It contains the following 4 sections:
 
 ```yaml
 # Deep learning configuration file ------------------------------------------------
@@ -77,7 +77,6 @@ The `config.yaml` file is located in the `conf` directory. It stores the values
 # 2) Sampling parameters
 # 3) Training parameters
 # 4) Inference parameters
-# 5) Model parameters
 ```
 
 Specific parameters in each section are shown below, where relevant. For more information about config.yaml, view file directly: [conf/config.yaml](https://github.com/NRCan/geo-deep-learning/blob/master/conf/config.yaml)
@@ -91,18 +90,6 @@ Specific parameters in each section are shown below, where relevant. For more in
 - [FCN (backbone: resnet101)](https://people.eecs.berkeley.edu/~jonlong/long_shelhamer_fcn.pdf)
 - [Deeplabv3 (backbone: resnet101)](https://arxiv.org/abs/1706.05587)
 
-The `config.yaml` contains parameters for each model. Here's an example:
-
-```yaml
-# Models parameters; used in train_model.py and inference.py
-
-models:
-  unet: unet001
-    dropout: False                  # Set dropout regularization
-    probability: 0.2                # Set with dropout
-    pretrained: /path/to/model/checkpoint.pth.tar   # Optional
-```
-
 ## `csv` preparation
 
 The `csv` specifies the input images and the reference vector data that will be used during the training. Each row in the `csv` file must contain 4 comma-separated items:
@@ -138,8 +125,8 @@ global:
   number_of_bands: 3                # Number of bands in input images
   model_name: unetsmall             # One of unet, unetsmall, checkpointed_unet, ternausnet, or inception
   bucket_name:                      # name of the S3 bucket where data is stored. Leave blank if using local files
-  debug_mode: True                  # Prints detailed progress bar
   scale_data: [0, 1]                # Min and Max for input data rescaling. Default: [0, 1]. Enter False if no rescaling is desired.
+  debug_mode: True                  # Prints detailed progress bar
 
 sample:
   prep_csv_file: /path/to/csv/file_name.csv   # Path to CSV file used in preparation.
@@ -175,7 +162,7 @@ Details on parameters used by this module:
 global:
   samples_size: 256                 # Size (in pixel) of the samples
   num_classes: 2                    # Number of classes
-  data_path: /path/to/data/folder   # Path to folder containing samples
+  data_path: /path/to/data/folder   # Path to folder containing samples, model and log files
   number_of_bands: 3                # Number of bands in input images
   model_name: unetsmall             # One of unet, unetsmall, checkpointed_unet, ternausnet, or inception
   bucket_name:                      # name of the S3 bucket where data is stored. Leave blank if using local files
@@ -183,20 +170,21 @@ global:
   num_gpus: 0                       # Number of GPU device(s) to use. Default: 0
   debug_mode: True                  # Prints detailed progress bar with sample loss, GPU stats (RAM, % of use) and information about current samples.
 
 training:
-  output_path: /path/to/output/weights/folder   # Path to folder where files containing weights will be written
+  state_dict_path: False            # Pretrained model path as .pth.tar or .pth file. Optional.
   num_trn_samples: 4960             # Number of samples to use for training. (default: all samples in hdfs file are taken)
   num_val_samples: 2208             # Number of samples to use for validation. (default: all samples in hdfs file are taken)
   num_tst_samples:                  # Number of samples to use for test. (default: all samples in hdfs file are taken)
   batch_size: 32                    # Size of each batch
   num_epochs: 150                   # Number of epochs
-  loss_fn: Lovasz        # One of CrossEntropy, Lovasz, Focal, OhemCrossEntropy (*Lovasz for segmentation tasks only)
-  optimizer: adabound    # One of adam, sgd or adabound
+  loss_fn: Lovasz                   # One of CrossEntropy, Lovasz, Focal, OhemCrossEntropy (*Lovasz for segmentation tasks only)
+  optimizer: adabound               # One of adam, sgd or adabound
   learning_rate: 0.0001             # Initial learning rate
   weight_decay: 0                   # Value for weight decay (each epoch)
-  gamma: 0.9                        # Multiple for learning rate decay
   step_size: 4                      # Apply gamma every step_size
+  gamma: 0.9                        # Multiple for learning rate decay
+  dropout: False                    # (bool) Use dropout or not. Applies to certain models only.
+  dropout_prob: False               # (float) Set dropout probability, e.g. 0.5
   class_weights: [1.0, 2.0]         # Weights to apply to each class. A value > 1.0 will apply more weights to the learning of the class.
   batch_metrics: 2                  # (int) Metrics computed every (int) batches. If left blank, will not perform metrics. If (int)=1, metrics computed on all batches.
   ignore_index: 0                   # Specifies a target value that is ignored and does not contribute to the input gradient. Default: None
@@ -214,6 +202,8 @@ Inputs:
 Output:
 - Trained model weights
 - checkpoint.pth.tar, corresponding to the training state where the validation loss was the lowest during the training process.
+- Model weights and log files are saved to: data_path / 'model' / name_of_.yaml_file.
+- If running multiple tests with the same data_path, a date-and-time suffix is appended to the directory name (i.e. the name of the .yaml file).
 
 Process:
 - The application loads the model
@@ -233,6 +223,11 @@ Optimizers:
 - SGD (standard optimizer in [torch.optim](https://pytorch.org/docs/stable/optim.html))
 - [Adabound/AdaboundW](https://openreview.net/forum?id=Bkg3g2R9FX)
+Advanced features:
+- To check how a pretrained model performs on the test split without fine-tuning, simply:
+  1. Specify state_dict_path in the training parameters.
+  2. In the same parameter section, set num_epochs to 0.
+
 
 ## inference.py
 
 The final step in the process is to assign every pixel in the original image a value corresponding to the most probable class.
@@ -249,12 +244,12 @@ global:
   model_name: unetsmall             # One of unet, unetsmall, checkpointed_unet, ternausnet, or inception
   bucket_name:                      # name of the S3 bucket where data is stored. Leave blank if using local files
   task: segmentation                # Task to perform. Either segmentation or classification
-  debug_mode: True                  # Prints detailed progress bar
   scale_data: [0, 1]                # Min and Max for input data rescaling. Default: [0, 1]. Enter False if no rescaling is desired.
+  debug_mode: True                  # Prints detailed progress bar
 
 inference:
-  img_csv_file: /path/to/csv/containing/images/list.csv          # CSV file containing the list of all images to infer on
+  img_dir_or_csv_file: /path/to/csv/containing/images/list.csv   # Directory containing all images to infer on OR CSV file with list of images
   working_folder: /path/to/folder/with/resulting/images          # Folder where all resulting images will be written
   state_dict_path: /path/to/model/weights/for/inference/checkpoint.pth.tar   # File containing pre-trained weights
   chunk_size: 512                   # (int) Size (height and width) of each prediction patch. Default: 512
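The `chunk_size` and `overlap` parameters drive a sliding-window prediction: the image is processed in overlapping patches, and per-pixel class scores are averaged wherever patches overlap. Below is a minimal sketch of that idea — the `model` callable, names and shapes are illustrative assumptions, not the repo's exact `sem_seg_inference()`:

```python
import numpy as np

def sliding_window_inference(image, model, chunk_size=512, overlap_pct=10, num_classes=2):
    """Average class scores over overlapping chunks, then take the per-pixel argmax."""
    h, w, _ = image.shape
    stride = chunk_size - int(chunk_size * overlap_pct / 100)
    probs = np.zeros((num_classes, h, w), dtype=np.float32)
    counts = np.zeros((h, w), dtype=np.float32)
    for row in range(0, h, stride):
        for col in range(0, w, stride):
            chunk = image[row:row + chunk_size, col:col + chunk_size, :]
            scores = model(chunk)                    # assumed to return (num_classes, ch, cw) scores
            ch, cw = scores.shape[1:]
            probs[:, row:row + ch, col:col + cw] += scores
            counts[row:row + ch, col:col + cw] += 1  # how many chunks covered each pixel
    return np.argmax(probs / np.maximum(counts, 1), axis=0).astype(np.uint8)

# Toy usage: a "model" that always scores class 1 highest, on a random 3-band image
toy = lambda c: np.stack([np.zeros(c.shape[:2]), np.ones(c.shape[:2])])
mask = sliding_window_inference(np.random.rand(1000, 1000, 3), toy)
```

Averaging the overlapping scores suppresses edge artifacts at patch borders, which is why a non-zero overlap is the default.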
@@ -322,15 +317,22 @@ global:
   debug_mode: True                  # Prints detailed progress bar with sample loss, GPU stats (RAM, % of use) and information about current samples.
 
 training:
-  output_path: /path/to/output/weights/folder   # Path to folder where files containing weights will be written
+  state_dict_path: False            # Pretrained model path as .pth.tar or .pth file. Optional.
   batch_size: 32                    # Size of each batch
   num_epochs: 150                   # Number of epochs
   learning_rate: 0.0001             # Initial learning rate
   weight_decay: 0                   # Value for weight decay (each epoch)
-  gamma: 0.9                        # Multiple for learning rate decay
   step_size: 4                      # Apply gamma every step_size
+  gamma: 0.9                        # Multiple for learning rate decay
+  dropout: False                    # (bool) Use dropout or not. Applies to certain models only.
+  dropout_prob: False               # (float) Set dropout probability, e.g. 0.5
   class_weights: [1.0, 2.0]         # Weights to apply to each class. A value > 1.0 will apply more weights to the learning of the class.
   batch_metrics: 2                  # (int) Metrics computed every (int) batches. If left blank, will not perform metrics. If (int)=1, metrics computed on all batches.
+  ignore_index: 0                   # Specifies a target value that is ignored and does not contribute to the input gradient. Default: None
+  augmentation:
+    rotate_limit: 45
+    rotate_prob: 0.5
+    hflip_prob: 0.5
 ```
 
 Note: ```data_path``` must always have a value for classification tasks
@@ -341,6 +343,8 @@ Output:
 - Trained model weights
 - checkpoint.pth.tar, corresponding to the training state where the validation loss was the lowest during the training process.
 - last_epoch.pth.tar, corresponding to the training state after the last epoch.
+- Model weights and log files are saved to: data_path / 'model' / name_of_.yaml_file.
+- If running multiple tests with the same data_path, a date-and-time suffix is appended to the directory name (i.e. the name of the .yaml file).
 
 Process:
 - The application loads the model specified in the configuration file
@@ -378,7 +382,7 @@ global:
   debug_mode: True                  # Prints detailed progress bar
 
 inference:
-  img_csv_file: /path/to/csv/containing/images/list.csv          # CSV file containing the list of all images to infer on
+  img_dir_or_csv_file: /path/to/csv/containing/images/list.csv   # Directory containing all images to infer on OR CSV file with list of images
   working_folder: /path/to/folder/with/resulting/images          # Folder where all resulting images will be written
   state_dict_path: /path/to/model/weights/for/inference/checkpoint.pth.tar   # File containing pre-trained weights
 ```
diff --git a/conf/config.yaml b/conf/config.yaml
index 8fa727d9..baea00f3 100644
--- a/conf/config.yaml
+++ b/conf/config.yaml
@@ -31,7 +31,7 @@ sample:
 
 # Training parameters; used in train_model.py ----------------------
 training:
-  output_path: /path/to/model/weights/output/folder
+  state_dict_path: path/to/pretrained/file/checkpoint.pth.tar   # optional
   num_trn_samples: 4960
   num_val_samples: 2208
   num_tst_samples: 1000
@@ -43,6 +43,8 @@ training:
   weight_decay: 0
   step_size: 4
   gamma: 0.9
+  dropout: False        # (bool) Use dropout or not
+  dropout_prob:         # (float) Set dropout probability, e.g. 0.5
   class_weights: [1.0, 2.0]
   batch_metrics:        # (int) Metrics computed every (int) batches. If left blank, will not perform metrics. If (int)=1, metrics computed on all batches.
   ignore_index: 0       # Specifies a target value that is ignored and does not contribute to the input gradient. Default: None
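The `class_weights` and `ignore_index` entries map directly onto the `weight` and `ignore_index` arguments of the underlying PyTorch loss. A minimal, self-contained sketch using the example values above (illustrative only):

```python
import torch
import torch.nn as nn

# class_weights: [1.0, 2.0] -> weight;  ignore_index: 0 -> ignore_index
criterion = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0]), ignore_index=0)

logits = torch.randn(4, 2, 64, 64)         # (batch, num_classes, H, W) raw scores
target = torch.randint(0, 2, (4, 64, 64))  # per-pixel class indices
loss = criterion(logits, target)           # pixels labelled 0 are skipped; class-1 errors weigh double
```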
@@ -54,29 +56,8 @@ training:
 
 # Inference parameters; used in inference.py --------
 inference:
-  img_csv_file: /path/to/csv/containing/images/list.csv
+  img_dir_or_csv_file: /path/to/csv/containing/images/list.csv
   working_folder: /path/to/folder/with/resulting/images
   state_dict_path: /path/to/model/weights/for/inference/checkpoint.pth.tar
   chunk_size: 512       # (int) Size (height and width) of each prediction patch. Default: 512
   overlap: 10           # (int) Percentage of overlap between 2 chunks. Default: 10
-
-# Models parameters; used in train_model.py and inference.py
-
-models:
-  unet: &unet001
-    dropout: False
-    probability: 0.2    # Set with dropout
-    pretrained: False   # optional
-  unetsmall:
-    <<: *unet001
-    pretrained:
-  ternausnet:
-    pretrained: ./models/TernausNet.pt   # Mandatory
-  checkpointed_unet:
-    <<: *unet001
-  fcn_resnet101:        # pretrained on coco dataset. Use only for 3 band data.
-    pretrained:         # optional
-  deeplabv3_resnet101:  # pretrained on coco dataset. Use only for 3 band data.
-    pretrained:         # optional
-  inception:
-    pretrained:         # optional
diff --git a/conf/config_ci_classification_local.yaml b/conf/config_ci_classification_local.yaml
index c78ed0ff..c44a5bad 100644
--- a/conf/config_ci_classification_local.yaml
+++ b/conf/config_ci_classification_local.yaml
@@ -31,7 +31,7 @@ sample:
 
 # Training parameters; used in train_model.py ----------------------
 training:
-  output_path: ./data
+  state_dict_path:      # optional
   num_trn_samples: 24
   num_val_samples: 24
   num_tst_samples:
@@ -43,6 +43,8 @@ training:
   weight_decay: 0
   step_size: 4
   gamma: 0.9
+  dropout: False        # (bool) Use dropout or not
+  dropout_prob: False   # (float) Set dropout probability, e.g. 0.5
   class_weights:
   batch_metrics: 1
   ignore_index:         # Specifies a target value that is ignored and does not contribute to the input gradient
@@ -50,28 +52,8 @@ training:
 
 # Inference parameters; used in inference.py --------
 inference:
-  img_csv_file: ./data/inference_classif_ci_csv.csv
+  img_dir_or_csv_file: ./data/inference_classif_ci_csv.csv
   working_folder: ./data/classification
-  state_dict_path: ./data/checkpoint.pth.tar
+  state_dict_path: ./data/model/config_ci_classification_local/checkpoint.pth.tar
   chunk_size:
-  overlap:
-
-# Models parameters; used in train_model.py and inference.py
-
-models:
-  unet: &unet001
-    dropout: False
-    probability: 0.2    # Set with dropout
-    pretrained: False   # optional
-  unetsmall:
-    <<: *unet001
-  ternausnet:
-    pretrained: ./models/TernausNet.pt   # Mandatory
-  checkpointed_unet:
-    <<: *unet001
-  fcn_resnet101:        # only for 3 band data
-    pretrained:         # optional
-  deeplabv3_resnet101:  # only for 3 band data
-    pretrained:         # optional
-  inception:
-    pretrained:         # optional
\ No newline at end of file
+  overlap:
\ No newline at end of file
diff --git a/conf/config_ci_segmentation_local.yaml b/conf/config_ci_segmentation_local.yaml
index 1d1c10fd..9308162c 100644
--- a/conf/config_ci_segmentation_local.yaml
+++ b/conf/config_ci_segmentation_local.yaml
@@ -31,7 +31,7 @@ sample:
 
 # Training parameters; used in train_model.py ----------------------
 training:
-  output_path: ./data
+  state_dict_path:      # optional
   num_trn_samples:
   num_val_samples:
   num_tst_samples:
@@ -43,6 +43,8 @@ training:
   weight_decay: 0
   step_size: 4
   gamma: 0.9
+  dropout: False        # (bool) Use dropout or not
+  dropout_prob: False   # (float) Set dropout probability, e.g. 0.5
   class_weights: [1.0, 2.0]
   batch_metrics: 1
   ignore_index: 0       # Specifies a target value that is ignored and does not contribute to the input gradient
@@ -54,29 +56,8 @@ training:
 
 # Inference parameters; used in inference.py --------
 inference:
-  img_csv_file: ./data/inference_sem_seg_ci_csv.csv
+  img_dir_or_csv_file: ./data/inference_sem_seg_ci_csv.csv
   working_folder: ./data
-  state_dict_path: ./data/checkpoint.pth.tar
+  state_dict_path: ./data/model/config_ci_segmentation_local/checkpoint.pth.tar
   chunk_size: 512       # (int) Size (height and width) of each prediction patch. Default: 512
-  overlap: 10           # (int) Percentage of overlap between 2 chunks. Default: 10
-
-# Models parameters; used in train_model.py and inference.py
-
-models:
-  unet: &unet001
-    dropout: False
-    probability: 0.2    # Set with dropout
-    pretrained: False   # optional
-  unetsmall:
-    <<: *unet001
-  ternausnet:
-    pretrained: ./models/TernausNet.pt   # Mandatory
-  checkpointed_unet:
-    <<: *unet001
-  fcn_resnet101:        # only for 3 band data
-    pretrained:         # optional
-  deeplabv3_resnet101:  # only for 3 band data
-    pretrained: /home/rtavon/Documents/kingston-test-deeplabv3-2/model/checkpoint.pth.tar   # optional
-  inception:
-    pretrained:         # optional
\ No newline at end of file
+  overlap: 10           # (int) Percentage of overlap between 2 chunks. Default: 10
\ No newline at end of file
diff --git a/images_to_samples.py b/images_to_samples.py
index ef7ae298..88f10378 100644
--- a/images_to_samples.py
+++ b/images_to_samples.py
@@ -1,5 +1,7 @@
 import argparse
 import os
+from pathlib import Path
+
 import numpy as np
 import warnings
 import fiona
@@ -161,6 +163,7 @@ def main(params):
     gpkg_file = []
     bucket_name = params['global']['bucket_name']
     data_path = params['global']['data_path']
+    Path.mkdir(Path(data_path), exist_ok=True)
     csv_file = params['sample']['prep_csv_file']
 
     if bucket_name:
@@ -199,7 +202,10 @@ def main(params):
                 bucket.download_file(info['gpkg'], info['gpkg'].split('/')[-1])
                 info['gpkg'] = info['gpkg'].split('/')[-1]
 
-            assert_band_number(info['tif'], params['global']['number_of_bands'])
+            if os.path.isfile(info['tif']):
+                assert_band_number(info['tif'], params['global']['number_of_bands'])
+            else:
+                raise IOError(f'Could not locate "{info["tif"]}". Make sure file exists in this directory.')
 
             _tqdm.set_postfix(OrderedDict(file=f'{info["tif"]}', sample_size=params['global']['samples_size']))
diff --git a/inference.py b/inference.py
index 57d01b2b..c3cb6370 100644
--- a/inference.py
+++ b/inference.py
@@ -13,10 +13,11 @@ from collections import OrderedDict
 import warnings
 from tqdm import tqdm
+from pathlib import Path
 
 from models.model_choice import net
 from utils.utils import read_parameters, assert_band_number, load_from_checkpoint, \
-    image_reader_as_array, read_csv, get_device_ids
+    image_reader_as_array, read_csv, get_device_ids, gpu_stats
 from utils.preprocess import minmax_scale
 
 try:
@@ -80,34 +81,43 @@ def sem_seg_inference(model, nd_array, overlay, chunk_size, num_classes, device)
 
     if padded_array.any():
         with torch.no_grad():
-            for row in tqdm(range(overlay, h, chunk_size - overlay), position=1, leave=False):
-                row_start = row - overlay
-                row_end = row_start + chunk_size
-                for col in range(overlay, w, chunk_size - overlay):
-                    col_start = col - overlay
-                    col_end = col_start + chunk_size
-
-                    chunk_input = padded_array[row_start:row_end, col_start:col_end, :]
-                    inputs = torch.from_numpy(np.float32(np.transpose(chunk_input, (2, 0, 1))))
-
-                    inputs.unsqueeze_(0)
-
-                    inputs = inputs.to(device)
-                    # forward
-                    outputs = model(inputs)
-
-                    # torchvision models give output it 'out' key. May cause problems in future versions of torchvision.
-                    if isinstance(outputs, OrderedDict) and 'out' in outputs.keys():
-                        outputs = outputs['out']
-
-                    output_counts[row_start:row_end, col_start:col_end] += 1
-                    output_probs[:, row_start:row_end, col_start:col_end] += np.squeeze(outputs.cpu().numpy(), axis=0)
+            with tqdm(range(overlay, h, chunk_size - overlay), position=1, leave=False) as _tqdm:
+                for row in _tqdm:
+                    row_start = row - overlay
+                    row_end = row_start + chunk_size
+                    for col in range(overlay, w, chunk_size - overlay):
+                        col_start = col - overlay
+                        col_end = col_start + chunk_size
+
+                        chunk_input = padded_array[row_start:row_end, col_start:col_end, :]
+                        inputs = torch.from_numpy(np.float32(np.transpose(chunk_input, (2, 0, 1))))
+
+                        inputs.unsqueeze_(0)
+
+                        inputs = inputs.to(device)
+                        # forward
+                        outputs = model(inputs)
+
+                        # torchvision models return output in an 'out' key. May cause problems in future versions of torchvision.
+                        if isinstance(outputs, OrderedDict) and 'out' in outputs.keys():
+                            outputs = outputs['out']
+
+                        output_counts[row_start:row_end, col_start:col_end] += 1
+                        output_probs[:, row_start:row_end, col_start:col_end] += np.squeeze(outputs.cpu().numpy(), axis=0)
+
+                        if debug and device.type == 'cuda':
+                            res, mem = gpu_stats(device=device.index)
+                            _tqdm.set_postfix(OrderedDict(device=device,
+                                                          gpu_perc=f'{res.gpu} %',
+                                                          gpu_RAM=f'{mem.used / (1024 ** 2):.0f}/{mem.total / (1024 ** 2):.0f} MiB',
+                                                          chunk_size=inputs.cpu().numpy().shape,
+                                                          output_size=outputs.cpu().numpy().shape))
 
         output_mask = np.argmax(np.divide(output_probs, np.maximum(output_counts, 1)), axis=0)
 
         # Resize the output array to the size of the input image and write it
         return output_mask[overlay:(h + overlay), overlay:(w + overlay)].astype(np.uint8)
     else:
-        print("Error classifying image : Image shape of {:1} is not recognized".format(len(nd_array.shape)))
+        raise IOError(f"Error classifying image: Image shape of {len(nd_array.shape)} is not recognized")
 
 
 def classifier(params, img_list, model):
@@ -203,7 +213,10 @@ def main(params):
     """
     since = time.time()
-    csv_file = params['inference']['img_csv_file']
+    img_dir_or_csv = params['inference']['img_dir_or_csv_file']
+    working_folder = Path(params['inference']['working_folder'])
+    Path.mkdir(working_folder, exist_ok=True)
+    print(f'Inferences will be saved to: {working_folder}')
 
     bucket = None
     bucket_name = params['global']['bucket_name']
@@ -225,10 +238,24 @@ def main(params):
     if bucket_name:
         s3 = boto3.resource('s3')
         bucket = s3.Bucket(bucket_name)
-        bucket.download_file(csv_file, 'img_csv_file.csv')
-        list_img = read_csv('img_csv_file.csv', inference=True)
+        if img_dir_or_csv.endswith('.csv'):
+            bucket.download_file(img_dir_or_csv, 'img_csv_file.csv')
+            list_img = read_csv('img_csv_file.csv', inference=True)
+        else:
+            raise NotImplementedError('Specify a csv file containing images for inference. Directory input not implemented yet')
     else:
-        list_img = read_csv(csv_file, inference=True)
+        if img_dir_or_csv.endswith('.csv'):
+            list_img = read_csv(img_dir_or_csv, inference=True)
+        else:
+            img_dir = Path(img_dir_or_csv)
+            assert img_dir.exists(), f'Could not find directory "{img_dir_or_csv}"'
+            list_img_paths = sorted(img_dir.glob('*.tif'))
+            list_img = []
+            for img_path in list_img_paths:
+                img = {}
+                img['tif'] = img_path
+                list_img.append(img)
+            assert len(list_img) > 0, f'No .tif files found in {img_dir_or_csv}'
 
     if params['global']['task'] == 'classification':
         classifier(params, list_img, model)
@@ -242,35 +269,42 @@ def main(params):
         chunk_size, nbr_pix_overlap = calc_overlap(params)
         num_classes = params['global']['num_classes']
-        for img in tqdm(list_img, desc='image list', position=0):
-            img_name = os.path.basename(img['tif'])
-            if bucket:
-                local_img = f"Images/{img_name}"
-                bucket.download_file(img['tif'], local_img)
-                inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
-            else:
-                local_img = img['tif']
-                inference_image = os.path.join(params['inference']['working_folder'],
-                                               f"{img_name.split('.')[0]}_inference.tif")
-
-            assert_band_number(local_img, params['global']['number_of_bands'])
-
-            nd_array_tif = image_reader_as_array(local_img)
-
-            # See: http://cs231n.github.io/neural-networks-2/#datapre. e.g. Scale arrays from [0,255] to [0,1]
-            scale = params['global']['scale_data']
-            if scale:
-                sc_min, sc_max = params['global']['scale_data']
-                nd_array_tif = minmax_scale(nd_array_tif,
-                                            orig_range=(np.min(nd_array_tif), np.max(nd_array_tif)),
-                                            scale_range=(sc_min,sc_max))
-
-            sem_seg_results = sem_seg_inference(model, nd_array_tif, nbr_pix_overlap, chunk_size, num_classes, device)
-            create_new_raster_from_base(local_img, inference_image, sem_seg_results)
-            tqdm.write(f"Semantic segmentation of image {img_name} completed")
-            if bucket:
-                bucket.upload_file(inference_image, os.path.join(params['inference']['working_folder'],
-                                                                 f"{img_name.split('.')[0]}_inference.tif"))
+        with tqdm(list_img, desc='image list', position=0) as _tqdm:
+            for img in _tqdm:
+                img_name = os.path.basename(img['tif'])
+                if bucket:
+                    local_img = f"Images/{img_name}"
+                    bucket.download_file(img['tif'], local_img)
+                    inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
+                else:
+                    local_img = img['tif']
+                    inference_image = os.path.join(params['inference']['working_folder'],
+                                                   f"{img_name.split('.')[0]}_inference.tif")
+
+                assert_band_number(local_img, params['global']['number_of_bands'])
+
+                nd_array_tif = image_reader_as_array(local_img)
+                assert len(np.unique(nd_array_tif)) > 1, f'Image "{img_name}" contains only one value: {np.unique(nd_array_tif)}.'
+
+                # See: http://cs231n.github.io/neural-networks-2/#datapre. e.g. Scale arrays from [0,255] to [0,1]
+                scale = params['global']['scale_data']
+                if scale:
+                    sc_min, sc_max = params['global']['scale_data']
+                    nd_array_tif = minmax_scale(nd_array_tif,
+                                                orig_range=(np.min(nd_array_tif), np.max(nd_array_tif)),
+                                                scale_range=(sc_min, sc_max))
+                if debug:
+                    _tqdm.set_postfix(OrderedDict(image_name=img_name, image_shape=nd_array_tif.shape, scale=scale))
+
+                sem_seg_results = sem_seg_inference(model, nd_array_tif, nbr_pix_overlap, chunk_size, num_classes, device)
+                if debug and len(np.unique(sem_seg_results)) == 1:
+                    print(f'Something is wrong. Inference contains only one value: {np.unique(sem_seg_results)}. Make sure '
+                          f'the "scale_data" parameter is consistent with the parameters used to train the model used for inference.')
+                create_new_raster_from_base(local_img, inference_image, sem_seg_results)
+                tqdm.write(f"Semantic segmentation of image {img_name} completed")
+                if bucket:
+                    bucket.upload_file(inference_image, os.path.join(params['inference']['working_folder'],
+                                                                     f"{img_name.split('.')[0]}_inference.tif"))
     else:
         raise ValueError(f"The task should be either classification or segmentation. The provided value is {params['global']['task']}")
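For reference, the `scale_data` handling above is plain min-max rescaling. A standalone sketch of the arithmetic behind the `minmax_scale(..., orig_range=..., scale_range=...)` call (the function name here is illustrative, not an import from the repo):

```python
import numpy as np

def minmax_rescale(img, orig_range, scale_range=(0.0, 1.0)):
    """Linearly map values from orig_range to scale_range."""
    o_min, o_max = orig_range
    s_min, s_max = scale_range
    return (img - o_min) / (o_max - o_min) * (s_max - s_min) + s_min

print(minmax_rescale(np.array([0., 127.5, 255.]), orig_range=(0, 255)))  # -> [0.  0.5 1. ]
```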
@@ -286,4 +320,6 @@ main(params):
     args = parser.parse_args()
     params = read_parameters(args.param_file)
 
+    debug = True if params['global']['debug_mode'] else False
+
     main(params)
diff --git a/models/TernausNet.py b/models/TernausNet.py
index 460a49c1..47aed0eb 100644
--- a/models/TernausNet.py
+++ b/models/TernausNet.py
@@ -96,7 +96,7 @@ def forward(self, x):
         return self.final(dec1)
 
 
-def ternausnet(num_classes, state_dict_path):
+def ternausnet(num_classes):
     """
     pretrained: False - no pre-trained network is used
diff --git a/models/model_choice.py b/models/model_choice.py
index 0c46db80..d5887af7 100644
--- a/models/model_choice.py
+++ b/models/model_choice.py
@@ -1,8 +1,31 @@
+import os
+import torch
 import torchvision.models as models
 from models import TernausNet, unet, checkpointed_unet, inception
 from utils.utils import chop_layer
 
 
+def load_checkpoint(filename):
+    ''' Loads checkpoint from provided path
+    :param filename: path to checkpoint as .pth.tar or .pth
+    :return: (dict) checkpoint ready to be loaded into model instance
+    '''
+    try:
+        print("=> loading model '{}'".format(filename))
+
+        checkpoint = torch.load(filename) if torch.cuda.is_available() else torch.load(filename, map_location='cpu')
+
+        # For loading external models with different structure in state dict. May cause problems when trying to load optimizer
+        if 'model' not in checkpoint.keys():
+            temp_checkpoint = {}
+            temp_checkpoint['model'] = {k: v for k, v in checkpoint.items()}  # Place entire state_dict inside 'model' key
+            del checkpoint
+            checkpoint = temp_checkpoint
+        return checkpoint
+    except FileNotFoundError:
+        raise FileNotFoundError(f"=> No model found at '{filename}'")
+
+
 def net(net_params, inference=False):
     """Define the neural net"""
     model_name = net_params['global']['model_name'].lower()
@@ -12,46 +35,37 @@ def net(net_params, inference=False):
     if model_name == 'unetsmall':
         model = unet.UNetSmall(num_classes,
                                net_params['global']['number_of_bands'],
-                               net_params['models']['unetsmall']['dropout'],
-                               net_params['models']['unetsmall']['probability'])
-        if net_params['models']['unetsmall']['pretrained']:
-            state_dict_path = net_params['models']['unetsmall']['pretrained']
+                               net_params['training']['dropout'],
+                               net_params['training']['dropout_prob'])
     elif model_name == 'unet':
         model = unet.UNet(num_classes,
                           net_params['global']['number_of_bands'],
-                          net_params['models']['unet']['dropout'],
-                          net_params['models']['unet']['probability'])
-        if net_params['models']['unet']['pretrained']:
-            state_dict_path = net_params['models']['unet']['pretrained']
+                          net_params['training']['dropout'],
+                          net_params['training']['dropout_prob'])
     elif model_name == 'ternausnet':
-        model = TernausNet.ternausnet(num_classes,
-                                      net_params['models']['ternausnet']['pretrained'])
-        if net_params['models']['ternausnet']['pretrained']:
-            state_dict_path = net_params['models']['ternausnet']['pretrained']
+        assert net_params['global']['number_of_bands'] == 3, msg
+        model = TernausNet.ternausnet(num_classes)
     elif model_name == 'checkpointed_unet':
         model = checkpointed_unet.UNetSmall(num_classes,
                                             net_params['global']['number_of_bands'],
-                                            net_params['models']['unetsmall']['dropout'],
-                                            net_params['models']['unetsmall']['probability'])
-        if net_params['models']['unetsmall']['pretrained']:
-            state_dict_path = net_params['models']['unetsmall']['pretrained']
+                                            net_params['training']['dropout'],
+                                            net_params['training']['dropout_prob'])
     elif model_name == 'inception':
         model = inception.Inception3(num_classes, net_params['global']['number_of_bands'])
-        if net_params['models']['inception']['pretrained']:
-            state_dict_path = net_params['models']['inception']['pretrained']
     elif model_name == 'fcn_resnet101':
-        assert net_params['global']['number_of_bands'], msg
+        assert net_params['global']['number_of_bands'] == 3, msg
         coco_model = models.segmentation.fcn_resnet101(pretrained=True, progress=True, num_classes=21, aux_loss=None)
         model = models.segmentation.fcn_resnet101(pretrained=False, progress=True, num_classes=num_classes, aux_loss=None)
         chopped_dict = chop_layer(coco_model.state_dict(), layer_names=['classifier.4'])
         del coco_model
-        model.load_state_dict(chopped_dict, strict=False)  # load the new state dict
-        if net_params['models']['fcn_resnet101']['pretrained']:
-            state_dict_path = net_params['models']['fcn_resnet101']['pretrained']
+        # load the new state dict
+        # When strict=False, allows to load only the variables that are identical between the two models irrespective of
+        # whether one is subset/superset of the other.
+        model.load_state_dict(chopped_dict, strict=False)
     elif model_name == 'deeplabv3_resnet101':
-        assert net_params['global']['number_of_bands'], msg
+        assert net_params['global']['number_of_bands'] == 3, msg
         # pretrained on coco (21 classes)
         coco_model = models.segmentation.deeplabv3_resnet101(pretrained=True, progress=True, num_classes=21, aux_loss=None)
@@ -59,15 +73,16 @@ def net(net_params, inference=False):
                                                         num_classes=num_classes, aux_loss=None)
         chopped_dict = chop_layer(coco_model.state_dict(), layer_names=['classifier.4'])
         del coco_model
-        # load the new state dict
-        model.load_state_dict(chopped_dict, strict=False)  # When strict=False, allows to load only the variables that
-        # are identical between the two models irrespective of whether one is subset/superset of the other.
-
-        if net_params['models']['deeplabv3_resnet101']['pretrained']:
-            state_dict_path = net_params['models']['deeplabv3_resnet101']['pretrained']
+        model.load_state_dict(chopped_dict, strict=False)
     else:
-        raise ValueError('The model name in the config.yaml is not defined.')
-    if inference:
+        raise ValueError(f'The model name {model_name} in the config.yaml is not defined.')
+
+    if net_params['training']['state_dict_path']:
+        state_dict_path = net_params['training']['state_dict_path']
+        checkpoint = load_checkpoint(state_dict_path)
+    elif inference:
         state_dict_path = net_params['inference']['state_dict_path']
+        checkpoint = load_checkpoint(state_dict_path)
+    else:
+        checkpoint = None
 
-    return model, state_dict_path, model_name
+    return model, checkpoint, model_name
diff --git a/train_model.py b/train_model.py
index f5aa0785..6cf7d21e 100644
--- a/train_model.py
+++ b/train_model.py
@@ -1,9 +1,8 @@
-from pathlib import Path
-
 import torch
 # import torch should be first. Unclear issue, mentioned here: https://github.com/pytorch/pytorch/issues/2083
 import argparse
 import os
+from pathlib import Path  # TODO Use Path instead of os where possible. Better cross-platform compatibility
 import csv
 import time
 import h5py
@@ -30,9 +29,9 @@
 from utils.optimizer import create_optimizer
 from utils.logger import InformationLogger, save_logs_to_bucket, tsv_line
 from utils.metrics import report_classification, create_metrics_dict
-from models.model_choice import net
+from models.model_choice import net, load_checkpoint
 from losses import MultiClassCriterion
-from utils.utils import read_parameters, load_from_checkpoint, list_s3_subfolders, get_device_ids
+from utils.utils import read_parameters, load_from_checkpoint, list_s3_subfolders, get_device_ids, gpu_stats
 
 try:
     import boto3
@@ -41,19 +40,6 @@
     pass
 
 
-def gpu_stats(device=0):
-    """
-    Provides GPU utilization (%) and RAM usage
-    :return: res.gpu, res.memory
-    """
-    nvmlInit()
-    handle = nvmlDeviceGetHandleByIndex(device)
-    res = nvmlDeviceGetUtilizationRates(handle)
-    mem = nvmlDeviceGetMemoryInfo(handle)
-
-    return res, mem
-
-
 def verify_weights(num_classes, weights):
     """Verifies that the number of weights equals the number of classes if any are given
     Args:
@@ -94,7 +80,7 @@ def get_s3_classification_images(dataset, bucket, bucket_name, data_path, output
     path = os.path.join('Images', dataset)
     try:
-        os.mkdir(path)
+        os.mkdir(path)  # TODO use Path from pathlib instead?
     except FileExistsError:
         pass
     for c in classes:
@@ -159,13 +145,14 @@ def download_s3_files(bucket_name, data_path, output_path, num_classes, task):
     return bucket, bucket_output_path, local_output_path, data_path
 
 
-def create_dataloader(data_path, num_samples, batch_size, task):
+def create_dataloader(data_path, num_samples, batch_size, task, num_devices):
     """
     Function to create dataloader objects for training, validation and test datasets.
     :param data_path: (str) path to the samples folder
     :param num_samples: (dict) number of samples for training, validation and test
     :param batch_size: (int) batch size
     :param task: (str) classification or segmentation
+    :param num_devices: (int) number of GPUs used
     :return: trn_dataloader, val_dataloader, tst_dataloader
     """
     if task == 'classification':
@@ -195,8 +182,8 @@ def create_dataloader(data_path, num_samples, batch_size, task):
 
     # Shuffle must be set to True.
     # https://discuss.pytorch.org/t/guidelines-for-assigning-num-workers-to-dataloader/813/5
-    if torch.cuda.device_count() > 1:
-        num_workers = torch.cuda.device_count() * 4
+    if num_devices > 1:
+        num_workers = num_devices * 4
     else:
         num_workers = 4
 
@@ -229,47 +216,30 @@ def get_num_samples(data_path, params):
     return num_samples
 
 
-def set_hyperparameters(params, model, state_dict_path):
+def set_hyperparameters(params, model, checkpoint):
     """
     Function to set hyperparameters based on values provided in yaml config file.
     Will also set model to GPU, if available.
-    If none provided, default functions values are used.
+    If none provided, default function values may be used.
    :param params: (dict) Parameters found in the yaml config file
    :param model: Model loaded from model_choice.py
-    :param state_dict_path: (str) Full file path to the state dict
+    :param checkpoint: (dict) state dict as loaded by model_choice.py
     :return: model, criterion, optimizer, lr_scheduler, num_gpus
     """
-
-    # assign default values to hyperparameters
-    loss_signature = inspect.signature(nn.CrossEntropyLoss).parameters
-    optim_signature = inspect.signature(optim.Adam).parameters
-    lr_scheduler_signature = inspect.signature(optim.lr_scheduler.StepLR).parameters
-    class_weights = loss_signature['weight'].default
-    ignore_index = loss_signature['ignore_index'].default
-    lr = optim_signature['lr'].default
-    weight_decay = optim_signature['weight_decay'].default
-    step_size = lr_scheduler_signature['step_size'].default
-    if not isinstance(step_size, int):
-        step_size = params['training']['num_epochs'] + 1
-    gamma = lr_scheduler_signature['gamma'].default
-    num_devices = 0
-
-    # replace default values by those in config file if they exist
+    # set mandatory hyperparameter values with those in config file if they exist
+    lr = params['training']['learning_rate']
+    weight_decay = params['training']['weight_decay']
+    step_size = params['training']['step_size']
+    gamma = params['training']['gamma']
+    num_devices = params['global']['num_gpus']
+    msg = 'Missing mandatory hyperparameter in config file. Make sure learning_rate, weight_decay, step_size, gamma and num_gpus are set.'
+    assert all(p is not None for p in [lr, weight_decay, step_size, gamma, num_devices]), msg
+
+    # optional hyperparameters. Set to None if not in config file
+    class_weights = torch.tensor(params['training']['class_weights']) if params['training']['class_weights'] else None
     if params['training']['class_weights']:
-        class_weights = torch.tensor(params['training']['class_weights'])
         verify_weights(params['global']['num_classes'], class_weights)
-    if params['training']['ignore_index']:
-        ignore_index = params['training']['ignore_index']
-    if params['training']['learning_rate']:
-        lr = params['training']['learning_rate']
-    if params['training']['weight_decay']:
-        weight_decay = params['training']['weight_decay']
-    if params['training']['step_size']:
-        step_size = params['training']['step_size']
-    if params['training']['gamma']:
-        gamma = params['training']['gamma']
-    if params['global']['num_gpus']:
-        num_devices = params['global']['num_gpus']
+    ignore_index = params['training']['ignore_index'] if params['training']['ignore_index'] is not None else -100
 
     # Loss function
     criterion = MultiClassCriterion(loss_type=params['training']['loss_fn'], ignore_index=ignore_index, weight=class_weights)
@@ -295,22 +265,32 @@ def set_hyperparameters(params, model, checkpoint):
     optimizer = create_optimizer(params=model.parameters(), mode=opt_fn, base_lr=lr, weight_decay=weight_decay)
     lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=gamma)
 
-    if state_dict_path != '':
-        model, optimizer = load_from_checkpoint(state_dict_path, model, optimizer=optimizer)
+    if checkpoint:
+        model, optimizer = load_from_checkpoint(checkpoint, model, optimizer=optimizer)
 
     return model, criterion, optimizer, lr_scheduler, device, num_devices
 
 
-def main(params):
+def main(params, config_path):
     """
     Function to train and validate a model for semantic segmentation or classification.
     :param params: (dict) Parameters found in the yaml config file.
+    :param config_path: (str) Path to the yaml config file.
""" - model, state_dict_path, model_name = net(params) + now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M") + + model, checkpoint, model_name = net(params) bucket_name = params['global']['bucket_name'] - output_path = params['training']['output_path'] data_path = params['global']['data_path'] + modelname = config_path.stem + output_path = Path(data_path).joinpath('model') / modelname + try: + output_path.mkdir(parents=True, exist_ok=False) + except FileExistsError: + output_path = Path(str(output_path)+'_'+now) + output_path.mkdir(exist_ok=True) + print(f'Model and log files will be saved to: {output_path}') task = params['global']['task'] num_classes = params['global']['num_classes'] batch_size = params['training']['batch_size'] @@ -330,25 +310,23 @@ def main(params): progress_log = Path(output_path) / 'progress.log' if not progress_log.exists(): - # Add header - # TODO overwrite existing log? - progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time')) + progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time')) # Add header trn_log = InformationLogger(output_path, 'trn') val_log = InformationLogger(output_path, 'val') tst_log = InformationLogger(output_path, 'tst') - model, criterion, optimizer, lr_scheduler, device, num_devices = set_hyperparameters(params, model, state_dict_path) + model, criterion, optimizer, lr_scheduler, device, num_devices = set_hyperparameters(params, model, checkpoint) num_samples = get_num_samples(data_path=data_path, params=params) print(f"Number of samples : {num_samples}") trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(data_path=data_path, num_samples=num_samples, batch_size=batch_size, - task=task) + task=task, + num_devices=num_devices) - now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M ") - filename = os.path.join(output_path, 'checkpoint.pth.tar') #TODO Should output directory hold same name as config file name? + filename = os.path.join(output_path, 'checkpoint.pth.tar') for epoch in range(0, params['training']['num_epochs']): print(f'\nEpoch {epoch}/{params["training"]["num_epochs"] - 1}\n{"-" * 20}') @@ -405,7 +383,9 @@ def main(params): print(f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s') # load checkpoint model and evaluate it on test dataset. 
-    model, _ = load_from_checkpoint(filename, model)
+    if int(params['training']['num_epochs']) > 0:  # if num_epochs is set to 0, the loaded model is evaluated on the test set as-is
+        checkpoint = load_checkpoint(filename)
+        model, _ = load_from_checkpoint(checkpoint, model)
     tst_report = evaluation(eval_loader=tst_dataloader,
                             model=model,
                             criterion=criterion,
@@ -479,6 +459,7 @@ def train(train_loader, model, criterion, optimizer, scheduler, num_classes, bat
                                               device=device,
                                               gpu_perc=f'{res.gpu} %',
                                               gpu_RAM=f'{mem.used/(1024**2):.0f}/{mem.total/(1024**2):.0f} MiB',
+                                              learning_rate=optimizer.param_groups[0]['lr'],
                                               img_size=data['sat_img'].numpy().shape,
                                               sample_size=data['map_img'].numpy().shape,
                                               batch_size=batch_size))
@@ -569,9 +550,10 @@ def evaluation(eval_loader, model, criterion, num_classes, batch_size, task, ep_
     parser.add_argument('param_file', metavar='DIR', help='Path to training parameters stored in yaml')
     args = parser.parse_args()
+    config_path = Path(args.param_file)
     params = read_parameters(args.param_file)
 
     debug = True if params['global']['debug_mode'] else False
 
-    main(params)
+    main(params, config_path)
     print('End of training')
diff --git a/utils/utils.py b/utils/utils.py
index fff8a14b..00c0e60b 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -89,56 +89,42 @@ def assert_band_number(in_image, band_count_yaml):
     assert in_array.shape[2] == band_count_yaml, msg
 
 
-def load_from_checkpoint(filename, model, optimizer=None):
+def load_from_checkpoint(checkpoint, model, optimizer=None):
     """Load weights from a previous checkpoint
     Args:
-        filename: full file path of file containing checkpoint
+        checkpoint: (dict) checkpoint as loaded in model_choice.py
        model: model to replace
        optimizer: optimiser to be used
    """
-    if os.path.isfile(filename):
-        print("=> loading model '{}'".format(filename))
-
-        checkpoint = torch.load(filename) if torch.cuda.is_available() else torch.load(filename, map_location='cpu')
-
-        # For loading external models with different structure in state dict. May cause problems when trying to load optimizer
-        if 'model' not in checkpoint.keys():
-            temp_checkpoint = {}
-            temp_checkpoint['model'] = {k: v for k, v in checkpoint.items()}  # Place entire state_dict inside 'model' key
-            del checkpoint
-            checkpoint = temp_checkpoint
-
-        # Corrects exception with test loop. Problem with loading generic checkpoint into DataParallel model
-        # https://github.com/bearpaw/pytorch-classification/issues/27
-        # https://discuss.pytorch.org/t/solved-keyerror-unexpected-key-module-encoder-embedding-weight-in-state-dict/1686/3
-        if isinstance(model, nn.DataParallel) and not list(checkpoint['model'].keys())[0].startswith('module'):
-            new_state_dict = model.state_dict().copy()
-            new_state_dict['model'] = {'module.'+k: v for k, v in checkpoint['model'].items()}  # Very flimsy
-            checkpoint['model'] = new_state_dict['model']
-
+    # Corrects exception with test loop. Problem with loading generic checkpoint into DataParallel model
+    # https://github.com/bearpaw/pytorch-classification/issues/27
+    # https://discuss.pytorch.org/t/solved-keyerror-unexpected-key-module-encoder-embedding-weight-in-state-dict/1686/3
+    if isinstance(model, nn.DataParallel) and not list(checkpoint['model'].keys())[0].startswith('module'):
+        new_state_dict = model.state_dict().copy()
+        new_state_dict['model'] = {'module.'+k: v for k, v in checkpoint['model'].items()}  # Very flimsy
+        checkpoint['model'] = new_state_dict['model']
+
+    try:
+        model.load_state_dict(checkpoint['model'])
+    except RuntimeError as error:
         try:
-            model.load_state_dict(checkpoint['model'])
+            list_errors = str(error).split('\n\t')
+            mismatched_layers = []
+            for error in list_errors:
+                if error.startswith('size mismatch'):
+                    mismatch_layer = error.split("size mismatch for ")[1].split(":")[0]  # get name of problematic layer
+                    print(f'Oops. {error}. We will try chopping "{mismatch_layer}" out of pretrained dictionary.')
+                    mismatched_layers.append(mismatch_layer)
+            chopped_checkpt = chop_layer(checkpoint['model'], layer_names=mismatched_layers)
+            # overwrite entries in the existing state dict
+            model.load_state_dict(chopped_checkpt, strict=False)
         except RuntimeError as error:
-            try:
-                list_errors = str(error).split('\n\t')
-                mismatched_layers = []
-                for error in list_errors:
-                    if error.startswith('size mismatch'):
-                        mismatch_layer = error.split("size mismatch for ")[1].split(":")[0]  # get name of problematic layer
-                        print(f'Oups. {error}. We will try chopping "{mismatch_layer}" out of pretrained dictionary.')
-                        mismatched_layers.append(mismatch_layer)
-                chopped_checkpt = chop_layer(checkpoint['model'], layer_names=mismatched_layers)
-                # overwrite entries in the existing state dict
-                model.load_state_dict(chopped_checkpt, strict=False)
-            except RuntimeError as error:
-                raise RuntimeError(error)
-
-        print(f"=> loaded model '{filename}'")
-        if optimizer and 'optimizer' in checkpoint.keys():  # 2nd condition if loading a model without optimizer
-            optimizer.load_state_dict(checkpoint['optimizer'])
-        return model, optimizer
-    else:
-        print(f"=> no model found at '{filename}'")
+            raise RuntimeError(error)
+
+    print("=> loaded model")
+    if optimizer and 'optimizer' in checkpoint.keys():  # 2nd condition if loading a model without optimizer
+        optimizer.load_state_dict(checkpoint['optimizer'])
+    return model, optimizer
 
 
 def image_reader_as_array(file_name):
@@ -149,14 +135,11 @@ def image_reader_as_array(file_name):
     Return:
         numpy_array of the image read
     """
-    try:
-        with rasterio.open(file_name, 'r') as src:
-            np_array = np.empty([src.height, src.width, src.count], dtype=np.float32)
-            for i in range(src.count):
-                band = src.read(i+1)  # Bands starts at 1 in rasterio not 0
-                np_array[:, :, i] = band
-    except IOError:
-        raise IOError(f'Could not locate "{file_name}". Make sure file exists in this directory.')
+    with rasterio.open(file_name, 'r') as src:
+        np_array = np.empty([src.height, src.width, src.count], dtype=np.float32)
+        for i in range(src.count):
+            band = src.read(i+1)  # Bands starts at 1 in rasterio not 0
+            np_array[:, :, i] = band
     return np_array
 
 
@@ -219,7 +202,7 @@ def read_csv(csv_file_name, inference=False):
     return sorted(list_values, key=lambda k: k['dataset'])
 
 
-def get_device_ids(number_requested):
+def get_device_ids(number_requested):  # FIXME: if some memory is used on a GPU before call to this function, the GPU will be excluded.
     """
     Function to check which GPU devices are available and unused.
     :param number_requested: (int) Number of devices requested.
@@ -231,9 +214,8 @@ def get_device_ids(number_requested):
     if number_requested > 0:
         device_count = nvmlDeviceGetCount()
         for i in range(device_count):
-            handle = nvmlDeviceGetHandleByIndex(i)
-            info = nvmlDeviceGetMemoryInfo(handle)
-            if round(info.used / 1024 ** 3, 1) == 0.0:
+            res, mem = gpu_stats(i)
+            if round(mem.used/(1024**2), 1) < 1500.0 and res.gpu < 10:  # Hardcoded tolerance for memory and usage
                 lst_free_devices.append(i)
                 if len(lst_free_devices) == number_requested:
                     break
@@ -245,4 +227,17 @@ def get_device_ids(number_requested):
     except NVMLError as error:
         raise ValueError(f"{error}. Make sure that the latest NVIDIA driver is installed and running.")
 
-    return lst_free_devices
\ No newline at end of file
+    return lst_free_devices
+
+
+def gpu_stats(device=0):
+    """
+    Provides GPU utilization (%) and RAM usage
+    :return: res.gpu, res.memory
+    """
+    nvmlInit()
+    handle = nvmlDeviceGetHandleByIndex(device)
+    res = nvmlDeviceGetUtilizationRates(handle)
+    mem = nvmlDeviceGetMemoryInfo(handle)
+
+    return res, mem
\ No newline at end of file
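A hypothetical usage of the two helpers above, showing how `get_device_ids` and `gpu_stats` fit together (the import path assumes they live in utils/utils.py):

```python
from utils.utils import get_device_ids, gpu_stats  # assumed import path

lst_free_devices = get_device_ids(2)  # ask for up to 2 idle GPUs
if lst_free_devices:
    res, mem = gpu_stats(device=lst_free_devices[0])
    print(f'GPU {lst_free_devices[0]}: {res.gpu} % busy, '
          f'{mem.used / (1024 ** 2):.0f}/{mem.total / (1024 ** 2):.0f} MiB used')
else:
    print('No unused GPU found; running on CPU.')
```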