[Fix] Add try-except block in model-dataset combination processing to prevent single errors from halting subsequent runs
Showing 1 changed file with 154 additions and 146 deletions.
@@ -70,155 +70,163 @@ def main():
     os.makedirs(pred_root, exist_ok=True)

     for _, dataset_name in enumerate(args.data):
-        dataset_kwargs = {}
-        if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']:
-            dataset_kwargs['model'] = model_name
-        if dataset_name == 'MMBench-Video':
-            dataset_kwargs['pack'] = args.pack
-        if dataset_name == 'Video-MME':
-            dataset_kwargs['use_subtitle'] = args.use_subtitle
-
-        # If distributed, first build the dataset on the main process for doing preparation works
-        if world_size > 1:
-            dataset = build_dataset(dataset_name, **dataset_kwargs) if rank == 0 else None
-            dist.barrier()
-            dataset_list = [dataset]
-            dist.broadcast_object_list(dataset_list, src=0)
-            dataset = dataset_list[0]
-        else:
-            dataset = build_dataset(dataset_name, **dataset_kwargs)
-        if dataset is None:
-            logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ')
-            continue
-
-        result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
-        if dataset_name in ['MMBench-Video']:
-            packstr = 'pack' if args.pack else 'nopack'
-            result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
-        elif dataset.MODALITY == 'VIDEO':
-            if args.pack:
-                logger.info(f'{dataset_name} not support Pack Mode, directly change to unpack')
-                args.pack = False
-            packstr = 'pack' if args.pack else 'nopack'
-            result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
-            if dataset_name in ['Video-MME']:
-                subtitlestr = 'subs' if args.use_subtitle else 'nosubs'
-                result_file = result_file.replace('.xlsx', f'_{subtitlestr}.xlsx')
-
-        if dataset.TYPE == 'MT':
-            result_file = result_file.replace('.xlsx', '.tsv')
-
-        if osp.exists(result_file) and args.rerun:
-            for keyword in ['openai', 'gpt', 'auxmatch']:
-                os.system(f'rm {pred_root}/{model_name}_{dataset_name}_{keyword}*')
-
-        if model is None:
-            model = model_name  # which is only a name
-
-        # Perform the Inference
-        if dataset.MODALITY == 'VIDEO':
-            model = infer_data_job_video(
-                model,
-                work_dir=pred_root,
-                model_name=model_name,
-                dataset=dataset,
-                nframe=args.nframe,
-                pack=args.pack,
-                verbose=args.verbose,
-                subtitle=args.use_subtitle,
-                api_nproc=args.nproc)
-        elif dataset.TYPE == 'MT':
-            model = infer_data_job_mt(
-                model,
-                work_dir=pred_root,
-                model_name=model_name,
-                dataset=dataset,
-                verbose=args.verbose,
-                api_nproc=args.nproc,
-                ignore_failed=args.ignore)
-        else:
-            model = infer_data_job(
-                model,
-                work_dir=pred_root,
-                model_name=model_name,
-                dataset=dataset,
-                verbose=args.verbose,
-                api_nproc=args.nproc,
-                ignore_failed=args.ignore)
-
-        # Set the judge kwargs first before evaluation or dumping
-        judge_kwargs = {
-            'nproc': args.nproc,
-            'verbose': args.verbose,
-        }
-        if args.retry is not None:
-            judge_kwargs['retry'] = args.retry
-        if args.judge is not None:
-            judge_kwargs['model'] = args.judge
-        else:
-            if dataset.TYPE in ['MCQ', 'Y/N']:
-                judge_kwargs['model'] = 'chatgpt-0125'
-            elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'], dataset_name):
-                judge_kwargs['model'] = 'gpt-4-turbo'
-            elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI'], dataset_name):
-                judge_kwargs['model'] = 'gpt-4o'
-        if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']):
-            judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE']
-        if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']):
-            judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE']
-
-        if rank == 0:
-            if dataset_name in ['MMMU_TEST']:
-                result_json = MMMU_result_transfer(result_file)
-                logger.info(f'Transfer MMMU_TEST result to json for official evaluation, '
-                            f'json file saved in {result_json}')  # noqa: E501
-                continue
-            elif 'MMT-Bench_ALL' in dataset_name:
-                submission_file = MMTBench_result_transfer(result_file, **judge_kwargs)
-                logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation '
-                            f'(https://eval.ai/web/challenges/challenge-page/2328/overview), '
-                            f'submission file saved in {submission_file}')  # noqa: E501
-                continue
-            elif 'MLLMGuard_DS' in dataset_name:
-                logger.info('The evaluation of MLLMGuard_DS is not supported yet. ')  # noqa: E501
-                continue
-            elif 'AesBench_TEST' == dataset_name:
-                logger.info(f'The results are saved in {result_file}. '
-                            f'Please send it to the AesBench Team via [email protected].')  # noqa: E501
-                continue
-
-        if dataset_name in [
-            'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN',
-            'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11'
-        ]:
-            if not MMBenchOfficialServer(dataset_name):
-                logger.error(
-                    f'Can not evaluate {dataset_name} on non-official servers, '
-                    'will skip the evaluation. '
-                )
-                continue
-
-        eval_proxy = os.environ.get('EVAL_PROXY', None)
-        old_proxy = os.environ.get('HTTP_PROXY', '')
-
-        if rank == 0 and args.mode == 'all':
-            if eval_proxy is not None:
-                proxy_set(eval_proxy)
-
-            eval_results = dataset.evaluate(result_file, **judge_kwargs)
-            if eval_results is not None:
-                assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
-                logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! ')
-                logger.info('Evaluation Results:')
-                if isinstance(eval_results, dict):
-                    logger.info('\n' + json.dumps(eval_results, indent=4))
-                elif isinstance(eval_results, pd.DataFrame):
-                    if len(eval_results) < len(eval_results.columns):
-                        eval_results = eval_results.T
-                    logger.info('\n' + tabulate(eval_results))
-
-            if eval_proxy is not None:
-                proxy_set(old_proxy)
+        try:
+            dataset_kwargs = {}
+            if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']:
+                dataset_kwargs['model'] = model_name
+            if dataset_name == 'MMBench-Video':
+                dataset_kwargs['pack'] = args.pack
+            if dataset_name == 'Video-MME':
+                dataset_kwargs['use_subtitle'] = args.use_subtitle
+
+            # If distributed, first build the dataset on the main process for doing preparation works
+            if world_size > 1:
+                dataset = build_dataset(dataset_name, **dataset_kwargs) if rank == 0 else None
+                dist.barrier()
+                dataset_list = [dataset]
+                dist.broadcast_object_list(dataset_list, src=0)
+                dataset = dataset_list[0]
+            else:
+                dataset = build_dataset(dataset_name, **dataset_kwargs)
+            if dataset is None:
+                logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ')
+                continue
+
+            if dataset_name in [
+                'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN',
+                'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11'
+            ]:
+                if not MMBenchOfficialServer(dataset_name):
+                    logger.error(
+                        f'Can not evaluate {dataset_name} on non-official servers, '
+                        'will skip the evaluation. '
+                    )
+                    continue
+            result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
+            if dataset_name in ['MMBench-Video']:
+                packstr = 'pack' if args.pack else 'nopack'
+                result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
+            elif dataset.MODALITY == 'VIDEO':
+                if args.pack:
+                    logger.info(f'{dataset_name} not support Pack Mode, directly change to unpack')
+                    args.pack = False
+                packstr = 'pack' if args.pack else 'nopack'
+                result_file = f'{pred_root}/{model_name}_{dataset_name}_{args.nframe}frame_{packstr}.xlsx'
+                if dataset_name in ['Video-MME']:
+                    subtitlestr = 'subs' if args.use_subtitle else 'nosubs'
+                    result_file = result_file.replace('.xlsx', f'_{subtitlestr}.xlsx')
+
+            if dataset.TYPE == 'MT':
+                result_file = result_file.replace('.xlsx', '.tsv')
+
+            if osp.exists(result_file) and args.rerun:
+                for keyword in ['openai', 'gpt', 'auxmatch']:
+                    os.system(f'rm {pred_root}/{model_name}_{dataset_name}_{keyword}*')
+
+            if model is None:
+                model = model_name  # which is only a name
+
+            # Perform the Inference
+            if dataset.MODALITY == 'VIDEO':
+                model = infer_data_job_video(
+                    model,
+                    work_dir=pred_root,
+                    model_name=model_name,
+                    dataset=dataset,
+                    nframe=args.nframe,
+                    pack=args.pack,
+                    verbose=args.verbose,
+                    subtitle=args.use_subtitle,
+                    api_nproc=args.nproc)
+            elif dataset.TYPE == 'MT':
+                model = infer_data_job_mt(
+                    model,
+                    work_dir=pred_root,
+                    model_name=model_name,
+                    dataset=dataset,
+                    verbose=args.verbose,
+                    api_nproc=args.nproc,
+                    ignore_failed=args.ignore)
+            else:
+                model = infer_data_job(
+                    model,
+                    work_dir=pred_root,
+                    model_name=model_name,
+                    dataset=dataset,
+                    verbose=args.verbose,
+                    api_nproc=args.nproc,
+                    ignore_failed=args.ignore)
+
+            # Set the judge kwargs first before evaluation or dumping
+            judge_kwargs = {
+                'nproc': args.nproc,
+                'verbose': args.verbose,
+            }
+            if args.retry is not None:
+                judge_kwargs['retry'] = args.retry
+            if args.judge is not None:
+                judge_kwargs['model'] = args.judge
+            else:
+                if dataset.TYPE in ['MCQ', 'Y/N']:
+                    judge_kwargs['model'] = 'chatgpt-0125'
+                elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'],
+                               dataset_name):
+                    judge_kwargs['model'] = 'gpt-4-turbo'
+                elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI'],
+                               dataset_name):
+                    judge_kwargs['model'] = 'gpt-4o'
+            if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']):
+                judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE']
+            if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']):
+                judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE']
+
+            if rank == 0:
+                if dataset_name in ['MMMU_TEST']:
+                    result_json = MMMU_result_transfer(result_file)
+                    logger.info(f'Transfer MMMU_TEST result to json for official evaluation, '
+                                f'json file saved in {result_json}')  # noqa: E501
+                    continue
+                elif 'MMT-Bench_ALL' in dataset_name:
+                    submission_file = MMTBench_result_transfer(result_file, **judge_kwargs)
+                    logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation '
+                                f'(https://eval.ai/web/challenges/challenge-page/2328/overview), '
+                                f'submission file saved in {submission_file}')  # noqa: E501
+                    continue
+                elif 'MLLMGuard_DS' in dataset_name:
+                    logger.info('The evaluation of MLLMGuard_DS is not supported yet. ')  # noqa: E501
+                    continue
+                elif 'AesBench_TEST' == dataset_name:
+                    logger.info(f'The results are saved in {result_file}. '
+                                f'Please send it to the AesBench Team via [email protected].')  # noqa: E501
+                    continue
+
+            eval_proxy = os.environ.get('EVAL_PROXY', None)
+            old_proxy = os.environ.get('HTTP_PROXY', '')
+
+            if rank == 0 and args.mode == 'all':
+                if eval_proxy is not None:
+                    proxy_set(eval_proxy)
+
+                eval_results = dataset.evaluate(result_file, **judge_kwargs)
+                if eval_results is not None:
+                    assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame)
+                    logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! ')
+                    logger.info('Evaluation Results:')
+                    if isinstance(eval_results, dict):
+                        logger.info('\n' + json.dumps(eval_results, indent=4))
+                    elif isinstance(eval_results, pd.DataFrame):
+                        if len(eval_results) < len(eval_results.columns):
+                            eval_results = eval_results.T
+                        logger.info('\n' + tabulate(eval_results))
+
+                if eval_proxy is not None:
+                    proxy_set(old_proxy)
+        except Exception as e:
+            logger.exception(f'Model {model_name} x Dataset {dataset_name} combination failed: {e}, '
+                             'skipping this combination.')
+            continue
+
+    if world_size > 1:
+        dist.barrier()
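
The change itself is a standard fault-isolation pattern for benchmark sweeps: wrap each model x dataset unit of work in try/except, log the full traceback, and move on to the next combination instead of letting one failure abort the rest of the run. Below is a minimal, self-contained sketch of that pattern, independent of this repository's internals; run_combination, the model and dataset names, and the 'BrokenBench' failure are hypothetical stand-ins for the inference-plus-evaluation body shown in the diff.

    import itertools
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('sweep')


    def run_combination(model_name, dataset_name):
        # Stand-in for the real per-combination body: build the dataset,
        # run inference, then evaluate.
        if dataset_name == 'BrokenBench':
            raise RuntimeError('judge API returned malformed output')
        logger.info(f'{model_name} x {dataset_name} finished')


    models = ['model-a', 'model-b']
    datasets = ['MMBench', 'BrokenBench', 'Video-MME']

    for model_name, dataset_name in itertools.product(models, datasets):
        try:
            run_combination(model_name, dataset_name)
        except Exception as e:
            # logger.exception logs at ERROR level and appends the active
            # traceback, so the failure stays diagnosable after the sweep.
            logger.exception(f'{model_name} x {dataset_name} failed: {e}, '
                             'skipping this combination.')
            continue

Note that the commit uses logger.exception rather than logger.error: called inside an except block, it records the active traceback along with the message. One caveat in distributed runs: if one rank raises and continues while the other ranks are blocked in a collective such as dist.barrier() or dist.broadcast_object_list(), the job can hang, which is presumably why the new code still synchronizes with a dist.barrier() guarded by world_size > 1 after the loop.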