Commit
[Fix] Add try-except block in model-dataset combination processing to prevent single errors from halting subsequent runs
Mor-Li committed Sep 20, 2024
2 parents 2ffc251 + f3a50e3 commit 9ca28fd
Showing 1 changed file: run.py (154 additions, 146 deletions)
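
The pattern the commit introduces, in brief: the entire body of the per-dataset loop in run.py is wrapped in a try/except, so an exception raised while processing one model x dataset combination is logged (with traceback) and skipped rather than aborting the remaining runs. A minimal, self-contained sketch of that pattern; the combinations list and the process() helper are hypothetical stand-ins for the real inference and evaluation code:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def process(model_name, dataset_name):
    # Hypothetical stand-in for the inference + evaluation body in run.py.
    if dataset_name == 'broken':
        raise RuntimeError('simulated dataset failure')
    logger.info(f'{model_name} x {dataset_name}: done')

combinations = [('my-model', d) for d in ['ok-1', 'broken', 'ok-2']]
for model_name, dataset_name in combinations:
    try:
        process(model_name, dataset_name)
    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback.
        logger.exception(f'{model_name} x {dataset_name} failed: {e}, skipping.')
        continue  # move on to the next combination

Run as-is, this sketch logs the simulated failure for 'broken' and still completes 'ok-2', which is exactly the behavior the commit message describes.
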
@@ -70,155 +70,163 @@ def main():
os.makedirs(pred_root, exist_ok=True)

for _, dataset_name in enumerate(args.data):
dataset_kwargs = {}
if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']:
dataset_kwargs['model'] = model_name
if dataset_name == 'MMBench-Video':
dataset_kwargs['pack'] = args.pack
if dataset_name == 'Video-MME':
dataset_kwargs['use_subtitle'] = args.use_subtitle

# If distributed, build the dataset on the main process first to handle the preparation work
if world_size > 1:
dataset = build_dataset(dataset_name, **dataset_kwargs) if rank == 0 else None
dist.barrier()
dataset_list = [dataset]
dist.broadcast_object_list(dataset_list, src=0)
dataset = dataset_list[0]
else:
dataset = build_dataset(dataset_name, **dataset_kwargs)
if dataset is None:
logger.error(f'Dataset {dataset_name} is not valid and will be skipped.')
continue

result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
if dataset_name in ['MMBench-Video']:
packstr = 'pack' if args.pack else 'nopack'
result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
elif dataset.MODALITY == 'VIDEO':
if args.pack:
logger.info(f'{dataset_name} does not support Pack Mode; falling back to unpack')
args.pack = False
packstr = 'pack' if args.pack else 'nopack'
result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
if dataset_name in ['Video-MME']:
subtitlestr = 'subs' if args.use_subtitle else 'nosubs'
result_file = result_file.replace('.xlsx', f'_{subtitlestr}.xlsx')

if dataset.TYPE == 'MT':
result_file = result_file.replace('.xlsx', '.tsv')

if osp.exists(result_file) and args.rerun:
for keyword in ['openai', 'gpt', 'auxmatch']:
os.system(f'rm {pred_root}/{model_name}_{dataset_name}_{keyword}*')

if model is None:
model = model_name # which is only a name

# Perform the Inference
if dataset.MODALITY == 'VIDEO':
model = infer_data_job_video(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
nframe=args.nframe,
pack=args.pack,
verbose=args.verbose,
subtitle=args.use_subtitle,
api_nproc=args.nproc)
elif dataset.TYPE == 'MT':
model = infer_data_job_mt(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
verbose=args.verbose,
api_nproc=args.nproc,
ignore_failed=args.ignore)
else:
model = infer_data_job(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
verbose=args.verbose,
api_nproc=args.nproc,
ignore_failed=args.ignore)

# Set the judge kwargs first before evaluation or dumping
judge_kwargs = {
'nproc': args.nproc,
'verbose': args.verbose,
}
if args.retry is not None:
judge_kwargs['retry'] = args.retry
if args.judge is not None:
judge_kwargs['model'] = args.judge
else:
if dataset.TYPE in ['MCQ', 'Y/N']:
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']):
judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE']
if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']):
judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE']

if rank == 0:
if dataset_name in ['MMMU_TEST']:
result_json = MMMU_result_transfer(result_file)
logger.info(f'Transfer MMMU_TEST result to json for official evaluation, '
f'json file saved in {result_json}') # noqa: E501
continue
elif 'MMT-Bench_ALL' in dataset_name:
submission_file = MMTBench_result_transfer(result_file, **judge_kwargs)
logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation '
f'(https://eval.ai/web/challenges/challenge-page/2328/overview), '
f'submission file saved in {submission_file}') # noqa: E501
continue
elif 'MLLMGuard_DS' in dataset_name:
logger.info('The evaluation of MLLMGuard_DS is not supported yet. ') # noqa: E501
continue
elif 'AesBench_TEST' == dataset_name:
logger.info(f'The results are saved in {result_file}. '
f'Please send it to the AesBench Team via [email protected].') # noqa: E501
try:
dataset_kwargs = {}
if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']:
dataset_kwargs['model'] = model_name
if dataset_name == 'MMBench-Video':
dataset_kwargs['pack'] = args.pack
if dataset_name == 'Video-MME':
dataset_kwargs['use_subtitle'] = args.use_subtitle

# If distributed, build the dataset on the main process first to handle the preparation work
if world_size > 1:
dataset = build_dataset(dataset_name, **dataset_kwargs) if rank == 0 else None
dist.barrier()
dataset_list = [dataset]
dist.broadcast_object_list(dataset_list, src=0)
dataset = dataset_list[0]
else:
dataset = build_dataset(dataset_name, **dataset_kwargs)
if dataset is None:
logger.error(f'Dataset {dataset_name} is not valid and will be skipped.')
continue

if dataset_name in [
'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN',
'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11'
]:
if not MMBenchOfficialServer(dataset_name):
logger.error(
f'Cannot evaluate {dataset_name} on non-official servers; '
'skipping the evaluation.'
)
continue
result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
if dataset_name in ['MMBench-Video']:
packstr = 'pack' if args.pack else 'nopack'
result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
elif dataset.MODALITY == 'VIDEO':
if args.pack:
logger.info(f'{dataset_name} does not support Pack Mode; falling back to unpack')
args.pack = False
packstr = 'pack' if args.pack else 'nopack'
result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
if dataset_name in ['Video-MME']:
subtitlestr = 'subs' if args.use_subtitle else 'nosubs'
result_file = result_file.replace('.xlsx', f'_{subtitlestr}.xlsx')

if dataset.TYPE == 'MT':
result_file = result_file.replace('.xlsx', '.tsv')

if osp.exists(result_file) and args.rerun:
for keyword in ['openai', 'gpt', 'auxmatch']:
os.system(f'rm {pred_root}/{model_name}_{dataset_name}_{keyword}*')

if model is None:
model = model_name # which is only a name

# Perform the Inference
if dataset.MODALITY == 'VIDEO':
model = infer_data_job_video(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
nframe=args.nframe,
pack=args.pack,
verbose=args.verbose,
subtitle=args.use_subtitle,
api_nproc=args.nproc)
elif dataset.TYPE == 'MT':
model = infer_data_job_mt(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
verbose=args.verbose,
api_nproc=args.nproc,
ignore_failed=args.ignore)
else:
model = infer_data_job(
model,
work_dir=pred_root,
model_name=model_name,
dataset=dataset,
verbose=args.verbose,
api_nproc=args.nproc,
ignore_failed=args.ignore)

# Set the judge kwargs first before evaluation or dumping
judge_kwargs = {
'nproc': args.nproc,
'verbose': args.verbose,
}
if args.retry is not None:
judge_kwargs['retry'] = args.retry
if args.judge is not None:
judge_kwargs['model'] = args.judge
else:
if dataset.TYPE in ['MCQ', 'Y/N']:
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'],
dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI'],
dataset_name):
judge_kwargs['model'] = 'gpt-4o'
if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']):
judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE']
if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']):
judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE']

if rank == 0:
if dataset_name in ['MMMU_TEST']:
result_json = MMMU_result_transfer(result_file)
logger.info(f'Transfer MMMU_TEST result to json for official evaluation, '
f'json file saved in {result_json}') # noqa: E501
continue
elif 'MMT-Bench_ALL' in dataset_name:
submission_file = MMTBench_result_transfer(result_file, **judge_kwargs)
logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation '
f'(https://eval.ai/web/challenges/challenge-page/2328/overview), '
f'submission file saved in {submission_file}') # noqa: E501
continue
elif 'MLLMGuard_DS' in dataset_name:
logger.info('The evaluation of MLLMGuard_DS is not supported yet. ') # noqa: E501
continue
elif 'AesBench_TEST' == dataset_name:
logger.info(f'The results are saved in {result_file}. '
f'Please send it to the AesBench Team via [email protected].') # noqa: E501
continue

if dataset_name in [
'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN',
'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11'
]:
if not MMBenchOfficialServer(dataset_name):
logger.error(
f'Cannot evaluate {dataset_name} on non-official servers; '
'skipping the evaluation.'
)
continue

eval_proxy = os.environ.get('EVAL_PROXY', None)
old_proxy = os.environ.get('HTTP_PROXY', '')

if rank == 0 and args.mode == 'all':
if eval_proxy is not None:
proxy_set(eval_proxy)

eval_results = dataset.evaluate(result_file, **judge_kwargs)
if eval_results is not None:
assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! ')
logger.info('Evaluation Results:')
if isinstance(eval_results, dict):
logger.info('\n' + json.dumps(eval_results, indent=4))
elif isinstance(eval_results, pd.DataFrame):
if len(eval_results) < len(eval_results.columns):
eval_results = eval_results.T
logger.info('\n' + tabulate(eval_results))

if eval_proxy is not None:
proxy_set(old_proxy)
except Exception as e:
logger.exception(f'Model {model_name} x Dataset {dataset_name} combination failed: {e}, '
'skipping this combination.')
continue

eval_proxy = os.environ.get('EVAL_PROXY', None)
old_proxy = os.environ.get('HTTP_PROXY', '')

if rank == 0 and args.mode == 'all':
if eval_proxy is not None:
proxy_set(eval_proxy)

eval_results = dataset.evaluate(result_file, **judge_kwargs)
if eval_results is not None:
assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! ')
logger.info('Evaluation Results:')
if isinstance(eval_results, dict):
logger.info('\n' + json.dumps(eval_results, indent=4))
elif isinstance(eval_results, pd.DataFrame):
if len(eval_results) < len(eval_results.columns):
eval_results = eval_results.T
logger.info('\n' + tabulate(eval_results))

if eval_proxy is not None:
proxy_set(old_proxy)
if world_size > 1:
dist.barrier()
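
As a side note on the rank-0 build-and-broadcast idiom near the top of the hunk: only the main process builds the dataset (which may download or preprocess files), and the result is then shared with every rank. A minimal standalone sketch, assuming a torch.distributed process group has already been initialized; make_dataset is a hypothetical stand-in for build_dataset:

import torch.distributed as dist

def make_dataset(name):
    # Hypothetical stand-in for build_dataset(); any picklable object works.
    return {'name': name, 'items': list(range(10))}

def build_on_rank0_and_broadcast(name, rank, world_size):
    # Only rank 0 performs the potentially expensive preparation work;
    # the finished object is then replicated to all other ranks.
    obj = make_dataset(name) if rank == 0 else None
    if world_size > 1:
        dist.barrier()                 # wait until rank 0 has finished preparing
        holder = [obj]                 # broadcast_object_list fills the list in place
        dist.broadcast_object_list(holder, src=0)
        obj = holder[0]
    return obj

Since broadcast_object_list fills the list in place, wrapping the object in a one-element list and unwrapping it afterwards mirrors the dataset_list pattern in the diff.
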

