diff --git a/README.md b/README.md
index 4e5dd455..7e7e9ed8 100644
--- a/README.md
+++ b/README.md
@@ -82,10 +82,11 @@ Before evaluating the model, you will need to download the COCO and VQAv2 datase
 pip install pycocoevalcap
 ```
 
-To evaluate the model, use script open_flamingo/eval/evaluate.py with the following arguments:
+To evaluate models, use the script open_flamingo/eval/run_eval.py with the following arguments:
 
 ```
-python evaluate.py
+python run_eval.py
+--model_dir model_checkpoints
 --lm_path facebook/opt-1.3b
 --lm_tokenizer_path facebook/opt-1.3b
 --clip_path openai/clip-vit-large-patch14
@@ -96,8 +97,11 @@ python evaluate.py
 --vqav2_image_dir_path path/to/vqav2/images
 --vqav2_annotation_path path/to/vqav2/v2_mscoco_train2014_annotations.json
 --vqav2_question_path path/to/vqav2/v2_OpenEnded_mscoco_train2014_questions.json
+--report_to_wandb
+--wandb_run_name online-eval
 ```
 
+This script watches the model_dir directory, evaluates each checkpoint as it is saved, and pushes the results to wandb.
+
 ### Wandb
 To log to wandb, use the --report_to wandb flag. The run name can be specified using the --run_name argument. To specify the wandb project, use the --wandb_project argument, and use --wandb_entity to specify the wandb entity.
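Reviewer note: the watcher in run_eval.py (diff below) only considers files whose names start with `checkpoint` and parses the integer after the last underscore as the epoch. A minimal sketch of that naming convention; the `model_checkpoints` directory name and the `.pt` extension are illustrative assumptions inferred from the parsing logic, not requirements stated elsewhere:

```python
import os

def epoch_of(filename: str) -> int:
    # Mirrors run_eval.py's parsing: take the chunk after the last "_",
    # strip the extension, and read it as an integer epoch,
    # e.g. "checkpoint_3.pt" -> 3.
    return int(filename.split("_")[-1].split(".")[0])

# Hypothetical layout; run_eval.py receives this directory via --model_dir.
model_dir = "model_checkpoints"
checkpoints = [f for f in os.listdir(model_dir) if f.startswith("checkpoint")]
for name in sorted(checkpoints, key=epoch_of):
    print(name, "-> epoch", epoch_of(name))
```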
default="open-flamingo", +) +parser.add_argument( + "--wandb_entity", + type=str, + default="anas-awadalla", +) +parser.add_argument( + "--wandb_run_name", + type=str, + default="online-eval", +) + +def run_evaluation_suite(args): + flamingo, image_processor, tokenizer = create_model_and_transforms( + args.clip_path, + args.clip_path, + args.lm_path, + args.lm_tokenizer_path, + ) + + checkpoint = torch.load(args.checkpoint_path, map_location="cpu")[ + "model_state_dict"] + # remove the "module." prefix from the keys + checkpoint = {k.replace("module.", ""): v for k, v in checkpoint.items()} + + flamingo.load_state_dict(checkpoint, strict=False) + flamingo.to(args.device if args.device >= 0 else "cpu") + + results = {"coco": [], "vqav2": []} + + print("Evaluating on COCO...") + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + cider_score = evaluate_coco( + model=flamingo, + tokenizer=tokenizer, + image_processor=image_processor, + batch_size=args.batch_size, + image_dir_path=args.coco_image_dir_path, + annotations_json_path=args.coco_annotations_json_path, + num_samples=args.num_samples, + num_shots=shot, + device=args.device, + seed=seed, + ) + print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}") + scores.append(cider_score) + print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}") + results["coco"].append( + {"shots": shot, "trials": scores, "mean": np.mean(scores)}) + + print("Evaluating on VQAv2...") + for shot in args.shots: + scores = [] + for seed, trial in zip(args.trial_seeds, range(args.num_trials)): + vqa_score = evaluate_vqa( + model=flamingo, + tokenizer=tokenizer, + image_processor=image_processor, + batch_size=args.batch_size, + num_samples=args.num_samples, + num_shots=shot, + device=args.device, + seed=seed, + image_dir_path=args.vqav2_image_dir_path, + questions_json_path=args.vqav2_questions_json_path, + annotations_json_path=args.vqav2_annotations_json_path, + ) + print(f"Shots {shot} Trial {trial} VQA score: {vqa_score}") + scores.append(vqa_score) + print(f"Shots {shot} Mean VQA score: {np.mean(scores)}") + results["vqav2"].append( + {"shots": shot, "trials": scores, "mean": np.mean(scores)}) + + return results + + +def main(): + args = parser.parse_args() + + if args.report_to_wandb: + wandb.init(project=args.wandb_project, + entity=args.wandb_entity, name=args.wandb_run_name) + + evaluated_checkpoints = set() + while True: + # check for new checkpoints + checkpoints = set([f for f in os.listdir( + args.model_dir) if f.startswith('checkpoint')]) + + if len(checkpoints) > 0: + # remove already evaluated checkpoints + checkpoints.difference_update(evaluated_checkpoints) + + # sort checkpoints by epoch + checkpoints = sorted(checkpoints, key=lambda x: int( + x.split('_')[-1].split('.')[0])) + + for path in checkpoints: + # pick the last checkpoint + checkpoint = os.path.join(args.model_dir, path) + epoch = int(checkpoint.split('_')[-1].split('.')[0]) + print('found new checkpoint: {}'.format(checkpoint)) + # evaluate the model + args.checkpoint_path = checkpoint + results = run_evaluation_suite(args) + evaluated_checkpoints.add(path) + + if args.report_to_wandb: + for dataset in results: + for result in results[dataset]: + metric_name = f"{dataset} {'cider' if dataset == 'coco' else 'vqa accuracy'} (shots = {result['shots']})" + wandb.log({metric_name: result['mean'], "epoch": epoch}) + else: + print('no checkpoint found, waiting for 10 mins...') + time.sleep(10 * 60) + +if __name__ == "__main__": + 
diff --git a/open_flamingo/eval/evaluate.py b/open_flamingo/eval/testbed.py
similarity index 70%
rename from open_flamingo/eval/evaluate.py
rename to open_flamingo/eval/testbed.py
index f002291a..c7dc4da5 100644
--- a/open_flamingo/eval/evaluate.py
+++ b/open_flamingo/eval/testbed.py
@@ -12,140 +12,6 @@ from tqdm import tqdm
 from vqa_metric import compute_vqa_accuracy, postprocess_vqa_generation
 
-from open_flamingo.src.factory import create_model_and_transforms
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
-parser.add_argument("--lm_tokenizer_path", type=str,
-                    default="facebook/opt-30b")
-parser.add_argument("--clip_path", type=str,
-                    default="openai/clip-vit-large-patch14")
-parser.add_argument("--checkpoint_path", type=str, required=True)
-
-# Trial arguments
-parser.add_argument("--shots", nargs="+", default=[0, 8])
-parser.add_argument(
-    "--num_trials",
-    type=int,
-    default=3,
-    help="Number of trials to run for each shot using different demonstrations",
-)
-parser.add_argument(
-    "--trial_seeds",
-    nargs="+",
-    default=[0, 1, 2],
-    help="Seeds to use for each trial for picking demonstrations and eval sets",
-)
-parser.add_argument(
-    "--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
-)
-
-parser.add_argument("--batch_size", type=int, default=8)
-parser.add_argument("--device", type=int, default=0)
-
-# Dataset arguments
-parser.add_argument(
-    "--coco_image_dir_path",
-    type=str,
-    default="/fsx/home-anasawadalla/data/coco/train2017",
-)
-parser.add_argument(
-    "--coco_annotations_json_path",
-    type=str,
-    default="/fsx/home-anasawadalla/data/coco/annotations/captions_train2017.json",
-)
-parser.add_argument(
-    "--vqav2_image_dir_path",
-    type=str,
-    default="/fsx/home-anasawadalla/data/vqav2/train2014",
-)
-parser.add_argument(
-    "--vqav2_questions_json_path",
-    type=str,
-    default="/fsx/home-anasawadalla/data/vqav2/v2_OpenEnded_mscoco_train2014_questions.json",
-)
-parser.add_argument(
-    "--vqav2_annotations_json_path",
-    type=str,
-    default="/fsx/home-anasawadalla/data/vqav2/v2_mscoco_train2014_annotations.json",
-)
-
-parser.add_argument("--results_file", type=str, default=None,
-                    help="JSON file to save results")
-
-
-def main():
-    args = parser.parse_args()
-
-    # load model
-    flamingo, image_processor, tokenizer = create_model_and_transforms(
-        args.clip_path,
-        args.clip_path,
-        args.lm_path,
-        args.lm_tokenizer_path,
-    )
-
-    checkpoint = torch.load(args.checkpoint_path, map_location="cpu")[
-        "model_state_dict"
-    ]
-    # remove the "module." prefix from the keys
-    checkpoint = {k.replace("module.", ""): v for k, v in checkpoint.items()}
-
-    flamingo.load_state_dict(checkpoint, strict=False)
-    flamingo.to(args.device if args.device >= 0 else "cpu")
-
-    results = {"coco": [], "vqav2": []}  # results to be saved to file
-
-    print("Evaluating on COCO...")
-    for shot in args.shots:
-        scores = []
-        for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
-            cider_score = evaluate_coco(
-                model=flamingo,
-                tokenizer=tokenizer,
-                image_processor=image_processor,
-                batch_size=args.batch_size,
-                image_dir_path=args.coco_image_dir_path,
-                annotations_json_path=args.coco_annotations_json_path,
-                num_samples=args.num_samples,
-                num_shots=shot,
-                device=args.device,
-                seed=seed,
-            )
-            print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}")
-            scores.append(cider_score)
-        print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}")
-        results["coco"].append(
-            {"shots": shot, "trials": scores, "mean": np.mean(scores)})
-
-    print("Evaluating on VQAv2...")
-    for shot in args.shots:
-        scores = []
-        for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
-            vqa_score = evaluate_vqa(
-                model=flamingo,
-                tokenizer=tokenizer,
-                image_processor=image_processor,
-                batch_size=args.batch_size,
-                num_samples=args.num_samples,
-                num_shots=shot,
-                device=args.device,
-                seed=seed,
-                image_dir_path=args.vqav2_image_dir_path,
-                questions_json_path=args.vqav2_questions_json_path,
-                annotations_json_path=args.vqav2_annotations_json_path,
-            )
-            print(f"Shots {shot} Trial {trial} VQA score: {vqa_score}")
-            scores.append(vqa_score)
-        print(f"Shots {shot} Mean VQA score: {np.mean(scores)}")
-        results["vqav2"].append(
-            {"shots": shot, "trials": scores, "mean": np.mean(scores)})
-
-    if args.results_file is not None:
-        with open(args.results_file, "w") as f:
-            json.dump(results, f)
-
-
 def evaluate_coco(
     model,
     tokenizer,
@@ -467,7 +333,3 @@ def get_prompt(sample, train=True):
     os.remove(f"vqaresults_{random_uuid}.json")
 
     return acc
-
-
-if __name__ == "__main__":
-    main()
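Reviewer note: with the CLI moved into run_eval.py, testbed.py now works as a plain library. A minimal sketch of a one-off evaluation; all paths are placeholders, the keyword arguments mirror the call sites in run_eval.py, and the import assumes open_flamingo/eval is on sys.path:

```python
import torch

from open_flamingo.src.factory import create_model_and_transforms
from testbed import evaluate_coco  # assumes open_flamingo/eval is on sys.path

flamingo, image_processor, tokenizer = create_model_and_transforms(
    "openai/clip-vit-large-patch14",  # vision encoder
    "openai/clip-vit-large-patch14",  # vision processor
    "facebook/opt-1.3b",              # language model
    "facebook/opt-1.3b",              # tokenizer
)

# Placeholder checkpoint path; strip the DDP "module." prefix,
# exactly as run_eval.py does before loading.
state = torch.load("path/to/checkpoint_0.pt", map_location="cpu")["model_state_dict"]
flamingo.load_state_dict(
    {k.replace("module.", ""): v for k, v in state.items()}, strict=False
)
flamingo.to(0)  # GPU 0

cider = evaluate_coco(
    model=flamingo,
    tokenizer=tokenizer,
    image_processor=image_processor,
    batch_size=8,
    image_dir_path="path/to/coco/train2017",
    annotations_json_path="path/to/coco/annotations/captions_train2017.json",
    num_samples=5000,
    num_shots=0,
    device=0,
    seed=0,
)
print(f"0-shot CIDEr: {cider}")
```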