[Won't merge this] Add online evaluation script #68

Closed
wants to merge 4 commits into from
8 changes: 6 additions & 2 deletions README.md
@@ -82,10 +82,11 @@ Before evaluating the model, you will need to download the COCO and VQAv2 datasets
pip install pycocoevalcap
```

To evaluate the model, use script open_flamingo/eval/evaluate.py with the following arguments:
To evaluate models, use the script open_flamingo/eval/run_eval.py with the following arguments:

```
python evaluate.py
python run_eval.py
--model_dir model_checkpoints
--lm_path facebook/opt-1.3b
--lm_tokenizer_path facebook/opt-1.3b
--clip_path openai/clip-vit-large-patch14
@@ -96,8 +97,11 @@ python evaluate.py
--vqav2_image_dir_path path/to/vqav2/images
--vqav2_annotation_path path/to/vqav2/v2_mscoco_train2014_annotations.json
--vqav2_question_path path/to/vqav2/v2_OpenEnded_mscoco_train2014_questions.json
--report_to_wandb
--wandb_run_name online-eval
```

This script watches the model_dir directory, evaluates new checkpoints as they are saved, and pushes the results to wandb.
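
The watcher relies on checkpoint filenames it can parse an epoch from (a checkpoint_<epoch>.pt-style pattern, inferred from the filename parsing in run_eval.py below). A minimal sketch of the detection step, under that assumption:

```
import os

def find_new_checkpoints(model_dir, already_evaluated):
    # Sketch of the detection logic in run_eval.py: keep files named like
    # checkpoint_<epoch>.pt that have not been scored yet, ordered by epoch.
    found = {f for f in os.listdir(model_dir) if f.startswith("checkpoint")}
    found -= set(already_evaluated)
    return sorted(found, key=lambda f: int(f.split("_")[-1].split(".")[0]))
```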

### Wandb
To log to wandb, use the --report_to_wandb flag. The run name is specified using the --run_name argument. To specify the wandb project, use the --wandb_project argument, and use --wandb_entity to specify the wandb entity.
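
For reference, a minimal sketch (not part of this change) of how these flags map onto wandb calls, mirroring run_eval.py below; the entity and the logged values are placeholders:

```
import wandb

# Placeholder entity and values for illustration; run_eval.py logs the real
# per-dataset, per-shot means against the checkpoint epoch.
wandb.init(project="open-flamingo", entity="my-entity", name="online-eval")
wandb.log({"coco cider (shots = 4)": 0.0, "epoch": 0})
wandb.finish()
```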
198 changes: 198 additions & 0 deletions open_flamingo/eval/run_eval.py
@@ -0,0 +1,198 @@
import argparse
import os
import time

import numpy as np
import torch
from testbed import evaluate_coco, evaluate_vqa

import wandb
from open_flamingo.src.factory import create_model_and_transforms

parser = argparse.ArgumentParser()
parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
parser.add_argument("--lm_tokenizer_path", type=str,
default="facebook/opt-30b")
parser.add_argument("--clip_path", type=str,
default="openai/clip-vit-large-patch14")

parser.add_argument("--model_dir", type=str, required=True, help="Path to model directory containing checkpoints")

# Trial arguments
parser.add_argument("--shots", nargs="+", default=[0, 4])
parser.add_argument(
"--num_trials",
type=int,
default=3,
help="Number of trials to run for each shot using different demonstrations",
)
parser.add_argument(
    "--trial_seeds",
    nargs="+",
    type=int,
    default=[0, 2, 4],
    help="Seeds to use for each trial for picking demonstrations and eval sets",
)
parser.add_argument(
"--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
)

parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--device", type=int, default=0)

# Dataset arguments
parser.add_argument(
"--coco_image_dir_path",
type=str,
default="/fsx/home-anasawadalla/data/coco/train2017",
)
parser.add_argument(
"--coco_annotations_json_path",
type=str,
default="/fsx/home-anasawadalla/data/coco/annotations/captions_train2017.json",
)
parser.add_argument(
"--vqav2_image_dir_path",
type=str,
default="/fsx/home-anasawadalla/data/vqav2/train2014",
)
parser.add_argument(
"--vqav2_questions_json_path",
type=str,
default="/fsx/home-anasawadalla/data/vqav2/v2_OpenEnded_mscoco_train2014_questions.json",
)
parser.add_argument(
"--vqav2_annotations_json_path",
type=str,
default="/fsx/home-anasawadalla/data/vqav2/v2_mscoco_train2014_annotations.json",
)

# Wandb arguments
parser.add_argument(
"--report_to_wandb",
action="store_true",
)
parser.add_argument(
"--wandb_project",
type=str,
default="open-flamingo",
)
parser.add_argument(
"--wandb_entity",
type=str,
default="anas-awadalla",
)
parser.add_argument(
"--wandb_run_name",
type=str,
default="online-eval",
)

def run_evaluation_suite(args):
    # build the model and transforms, load the checkpoint given in
    # args.checkpoint_path, then run the COCO and VQAv2 evaluations
flamingo, image_processor, tokenizer = create_model_and_transforms(
args.clip_path,
args.clip_path,
args.lm_path,
args.lm_tokenizer_path,
)

checkpoint = torch.load(args.checkpoint_path, map_location="cpu")[
"model_state_dict"]
# remove the "module." prefix from the keys
checkpoint = {k.replace("module.", ""): v for k, v in checkpoint.items()}

flamingo.load_state_dict(checkpoint, strict=False)
flamingo.to(args.device if args.device >= 0 else "cpu")

results = {"coco": [], "vqav2": []}

print("Evaluating on COCO...")
for shot in args.shots:
scores = []
for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
cider_score = evaluate_coco(
model=flamingo,
tokenizer=tokenizer,
image_processor=image_processor,
batch_size=args.batch_size,
image_dir_path=args.coco_image_dir_path,
annotations_json_path=args.coco_annotations_json_path,
num_samples=args.num_samples,
num_shots=shot,
device=args.device,
seed=seed,
)
print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}")
scores.append(cider_score)
print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}")
results["coco"].append(
{"shots": shot, "trials": scores, "mean": np.mean(scores)})

print("Evaluating on VQAv2...")
for shot in args.shots:
scores = []
for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
vqa_score = evaluate_vqa(
model=flamingo,
tokenizer=tokenizer,
image_processor=image_processor,
batch_size=args.batch_size,
num_samples=args.num_samples,
num_shots=shot,
device=args.device,
seed=seed,
image_dir_path=args.vqav2_image_dir_path,
questions_json_path=args.vqav2_questions_json_path,
annotations_json_path=args.vqav2_annotations_json_path,
)
print(f"Shots {shot} Trial {trial} VQA score: {vqa_score}")
scores.append(vqa_score)
print(f"Shots {shot} Mean VQA score: {np.mean(scores)}")
results["vqav2"].append(
{"shots": shot, "trials": scores, "mean": np.mean(scores)})

return results


def main():
args = parser.parse_args()

if args.report_to_wandb:
wandb.init(project=args.wandb_project,
entity=args.wandb_entity, name=args.wandb_run_name)

evaluated_checkpoints = set()
while True:
        # collect checkpoints that have not been evaluated yet
        checkpoints = {f for f in os.listdir(args.model_dir)
                       if f.startswith('checkpoint')}
        # remove already evaluated checkpoints
        checkpoints.difference_update(evaluated_checkpoints)

        if len(checkpoints) > 0:

# sort checkpoints by epoch
checkpoints = sorted(checkpoints, key=lambda x: int(
x.split('_')[-1].split('.')[0]))

            for path in checkpoints:
                checkpoint = os.path.join(args.model_dir, path)
                # parse the epoch from the checkpoint filename (checkpoint_<epoch>.pt)
                epoch = int(checkpoint.split('_')[-1].split('.')[0])
                print(f'found new checkpoint: {checkpoint}')
# evaluate the model
args.checkpoint_path = checkpoint
results = run_evaluation_suite(args)
evaluated_checkpoints.add(path)

if args.report_to_wandb:
for dataset in results:
for result in results[dataset]:
metric_name = f"{dataset} {'cider' if dataset == 'coco' else 'vqa accuracy'} (shots = {result['shots']})"
wandb.log({metric_name: result['mean'], "epoch": epoch})
else:
            print('no new checkpoint found, waiting for 10 mins...')
time.sleep(10 * 60)

if __name__ == "__main__":
main()

138 changes: 0 additions & 138 deletions open_flamingo/eval/evaluate.py → open_flamingo/eval/testbed.py
@@ -12,140 +12,6 @@
from tqdm import tqdm
from vqa_metric import compute_vqa_accuracy, postprocess_vqa_generation

from open_flamingo.src.factory import create_model_and_transforms

parser = argparse.ArgumentParser()
parser.add_argument("--lm_path", type=str, default="facebook/opt-1.3b")
parser.add_argument("--lm_tokenizer_path", type=str,
default="facebook/opt-30b")
parser.add_argument("--clip_path", type=str,
default="openai/clip-vit-large-patch14")
parser.add_argument("--checkpoint_path", type=str, required=True)

# Trial arguments
parser.add_argument("--shots", nargs="+", default=[0, 8])
parser.add_argument(
"--num_trials",
type=int,
default=3,
help="Number of trials to run for each shot using different demonstrations",
)
parser.add_argument(
"--trial_seeds",
nargs="+",
default=[0, 1, 2],
help="Seeds to use for each trial for picking demonstrations and eval sets",
)
parser.add_argument(
"--num_samples", type=int, default=5000, help="Number of samples to evaluate on"
)

parser.add_argument("--batch_size", type=int, default=8)
parser.add_argument("--device", type=int, default=0)

# Dataset arguments
parser.add_argument(
"--coco_image_dir_path",
type=str,
default="/fsx/home-anasawadalla/data/coco/train2017",
)
parser.add_argument(
"--coco_annotations_json_path",
type=str,
default="/fsx/home-anasawadalla/data/coco/annotations/captions_train2017.json",
)
parser.add_argument(
"--vqav2_image_dir_path",
type=str,
default="/fsx/home-anasawadalla/data/vqav2/train2014",
)
parser.add_argument(
"--vqav2_questions_json_path",
type=str,
default="/fsx/home-anasawadalla/data/vqav2/v2_OpenEnded_mscoco_train2014_questions.json",
)
parser.add_argument(
"--vqav2_annotations_json_path",
type=str,
default="/fsx/home-anasawadalla/data/vqav2/v2_mscoco_train2014_annotations.json",
)

parser.add_argument("--results_file", type=str, default=None,
help="JSON file to save results")


def main():
args = parser.parse_args()

# load model
flamingo, image_processor, tokenizer = create_model_and_transforms(
args.clip_path,
args.clip_path,
args.lm_path,
args.lm_tokenizer_path,
)

checkpoint = torch.load(args.checkpoint_path, map_location="cpu")[
"model_state_dict"
]
# remove the "module." prefix from the keys
checkpoint = {k.replace("module.", ""): v for k, v in checkpoint.items()}

flamingo.load_state_dict(checkpoint, strict=False)
flamingo.to(args.device if args.device >= 0 else "cpu")

results = {"coco": [], "vqav2": []} # results to be saved to file

print("Evaluating on COCO...")
for shot in args.shots:
scores = []
for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
cider_score = evaluate_coco(
model=flamingo,
tokenizer=tokenizer,
image_processor=image_processor,
batch_size=args.batch_size,
image_dir_path=args.coco_image_dir_path,
annotations_json_path=args.coco_annotations_json_path,
num_samples=args.num_samples,
num_shots=shot,
device=args.device,
seed=seed,
)
print(f"Shots {shot} Trial {trial} CIDEr score: {cider_score}")
scores.append(cider_score)
print(f"Shots {shot} Mean CIDEr score: {np.mean(scores)}")
results["coco"].append(
{"shots": shot, "trials": scores, "mean": np.mean(scores)})

print("Evaluating on VQAv2...")
for shot in args.shots:
scores = []
for seed, trial in zip(args.trial_seeds, range(args.num_trials)):
vqa_score = evaluate_vqa(
model=flamingo,
tokenizer=tokenizer,
image_processor=image_processor,
batch_size=args.batch_size,
num_samples=args.num_samples,
num_shots=shot,
device=args.device,
seed=seed,
image_dir_path=args.vqav2_image_dir_path,
questions_json_path=args.vqav2_questions_json_path,
annotations_json_path=args.vqav2_annotations_json_path,
)
print(f"Shots {shot} Trial {trial} VQA score: {vqa_score}")
scores.append(vqa_score)
print(f"Shots {shot} Mean VQA score: {np.mean(scores)}")
results["vqav2"].append(
{"shots": shot, "trials": scores, "mean": np.mean(scores)})

if args.results_file is not None:
with open(args.results_file, "w") as f:
json.dump(results, f)


def evaluate_coco(
model,
tokenizer,
@@ -467,7 +333,3 @@ def get_prompt(sample, train=True):
os.remove(f"vqaresults_{random_uuid}.json")

return acc


if __name__ == "__main__":
main()
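
With its argparse/CLI entry point removed, testbed.py is now an importable module exposing evaluate_coco and evaluate_vqa, which run_eval.py consumes. A minimal usage sketch (not part of the diff) for scoring a single checkpoint directly; the checkpoint filename and dataset paths are placeholders, and the calls mirror run_eval.py above:

```
import torch
from testbed import evaluate_coco
from open_flamingo.src.factory import create_model_and_transforms

# build the model and load one checkpoint, as run_eval.py does
flamingo, image_processor, tokenizer = create_model_and_transforms(
    "openai/clip-vit-large-patch14",
    "openai/clip-vit-large-patch14",
    "facebook/opt-1.3b",
    "facebook/opt-1.3b",
)
state_dict = torch.load("model_checkpoints/checkpoint_0.pt",
                        map_location="cpu")["model_state_dict"]
state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
flamingo.load_state_dict(state_dict, strict=False)
flamingo.to(0)

cider = evaluate_coco(
    model=flamingo,
    tokenizer=tokenizer,
    image_processor=image_processor,
    batch_size=16,
    image_dir_path="path/to/coco/train2017",
    annotations_json_path="path/to/coco/annotations/captions_train2017.json",
    num_samples=5000,
    num_shots=4,
    device=0,
    seed=0,
)
print(f"4-shot CIDEr: {cider}")
```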