[Benchmark] MathVerse (#483)
* Add MathVerse

* Fix format.
CaraJ7 authored Sep 23, 2024
1 parent 48b3224 commit e254f00
Showing 8 changed files with 311 additions and 7 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -73,8 +73,9 @@ English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/REA
| [**A-OKVQA**](https://arxiv.org/abs/2206.01718)+ | A-OKVQA | MCQ | [**MuirBench**](https://muirbench.github.io)+ | MUIRBench | MCQ |
| [**GMAI-MMBench**](https://huggingface.co/papers/2408.03361)+ | GMAI-MMBench_VAL | MCQ | [**TableVQABench**](https://arxiv.org/abs/2404.19205)+ | TableVQABench | VQA |
| [**MME-RealWorld**](https://arxiv.org/abs/2408.13257)+ | MME-RealWorld[-CN] | MCQ | [**HRBench**](https://arxiv.org/abs/2408.15556)+ | HRBench[4K/8K] | MCQ |
| [**MathVerse**](https://mathverse-cuhk.github.io/)+ | MathVerse_MINI<br/>MathVerse_MINI_Vision_Only <br/>MathVerse_MINI_Vision_Dominant<br/>MathVerse_MINI_Vision_Intensive<br/>MathVerse_MINI_Text_Lite<br/>MathVerse_MINI_Text_Dominant | VQA | | | |

**\*** We only provide a subset of the evaluation results, since some VLMs do not yield reasonable results under the zero-shot setting

**\+** The evaluation results are not available yet

2 changes: 1 addition & 1 deletion run.py
@@ -157,7 +157,7 @@ def main():
if args.judge is not None:
judge_kwargs['model'] = args.judge
else:
if dataset.TYPE in ['MCQ', 'Y/N']:
if dataset.TYPE in ['MCQ', 'Y/N'] or listinstr(['MathVerse'], dataset_name):
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'],
dataset_name):
4 changes: 2 additions & 2 deletions vlmeval/dataset/__init__.py
@@ -9,7 +9,7 @@
from .image_mt import MMDUDataset
from .image_vqa import (
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
CustomVQADataset, CRPE
CustomVQADataset, CRPE, MathVerse
)

from .vcr import VCRDataset
@@ -115,7 +115,7 @@ def evaluate(self, eval_file, **judge_kwargs):
ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse
]

VIDEO_DATASET = [
105 changes: 105 additions & 0 deletions vlmeval/dataset/image_vqa.py
@@ -215,6 +215,111 @@ def evaluate(self, eval_file, **judge_kwargs):
return score


class MathVerse(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
'MathVerse_MINI': 'https://huggingface.co/datasets/CaraJ/Mathverse_VLMEvalKit/resolve/main/testmini.tsv', # noqa
'MathVerse_MINI_Vision_Only': 'https://huggingface.co/datasets/CaraJ/Mathverse_VLMEvalKit/resolve/main/testmini_Vision_Only.tsv', # noqa
'MathVerse_MINI_Vision_Dominant': 'https://huggingface.co/datasets/CaraJ/Mathverse_VLMEvalKit/resolve/main/testmini_Vision_Dominant.tsv', # noqa
'MathVerse_MINI_Vision_Intensive': 'https://huggingface.co/datasets/CaraJ/Mathverse_VLMEvalKit/resolve/main/testmini_Vision_Intensive.tsv', # noqa
'MathVerse_MINI_Text_Lite': 'https://huggingface.co/datasets/CaraJ/Mathverse_VLMEvalKit/resolve/main/testmini_Text_Lite.tsv', # noqa
'MathVerse_MINI_Text_Dominant': 'https://huggingface.co/datasets/CaraJ/Mathverse_VLMEvalKit/resolve/main/testmini_Text_Dominant.tsv', # noqa
}
DATASET_MD5 = {
'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
}

    # Returns a DataFrame with overall accuracy plus per problem_version / subfield breakdowns
@classmethod
def evaluate(self, eval_file, **judge_kwargs):
from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc

model = judge_kwargs['model']
suffix = eval_file.split('.')[-1]
storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
nproc = judge_kwargs.pop('nproc', 4)
# stage1: extract the answer
if not osp.exists(storage_extract):
data = load(eval_file)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]

ans = {}
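            # Resume support: extraction results already cached in tmp_file_extract are reused, and only missing indices are sent to the judge.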
if osp.exists(tmp_file_extract):
ans = load(tmp_file_extract)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]

if len(indices):
new_results = track_progress_rich(
MathVerse_auxeval_extract,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file_extract,
)
ans = load(tmp_file_extract)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']

data['extract'] = [ans[idx]['extract'] for idx in data['index']]
data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
dump(data, storage_extract)

# stage2: score the answer
if not osp.exists(storage_score):
data = load(storage_extract)
model = build_judge(max_tokens=128, **judge_kwargs)
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
lt = len(data)
lines = [data.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]

ans = {}
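            # Same resume logic as the extraction stage, now for scoring results cached in tmp_file_score.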
if osp.exists(tmp_file_score):
ans = load(tmp_file_score)
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]

if len(indices):
new_results = track_progress_rich(
MathVerse_auxeval_score,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file_score,
)
ans = load(tmp_file_score)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']

data['score'] = [ans[idx]['score'] for idx in data['index']]
data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
dump(data, storage_score)

score = MathVerse_acc(storage_score)
score_pth = storage_score.replace('.xlsx', '_score.csv')
dump(score, score_pth)
return score


class MathVision(ImageBaseDataset):
TYPE = 'VQA'
DATASET_URL = {
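For context, a minimal sketch of how the MathVerse dataset class added above might be driven programmatically. The prediction path, model name, and the build_dataset import location are illustrative assumptions, not part of this commit, and a working OpenAI-compatible judge is required:

    # Hypothetical usage sketch -- names and paths below are placeholders.
    from vlmeval.dataset import build_dataset  # assumed location of the build_dataset helper

    dataset = build_dataset('MathVerse_MINI')  # any of the six MathVerse_MINI* splits works the same way
    score = dataset.evaluate(
        'outputs/MyVLM/MyVLM_MathVerse_MINI.xlsx',  # prediction file dumped by run.py (placeholder path)
        model='chatgpt-0125',  # judge model, matching the default wired up in run.py above
        nproc=4,
    )
    print(score)  # DataFrame with overall accuracy plus problem_version / subfield breakdowns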
198 changes: 198 additions & 0 deletions vlmeval/dataset/utils/mathverse.py
@@ -0,0 +1,198 @@
from ...smp import *
from ...utils import can_infer


FAIL_MSG = 'Failed to obtain answer via API.'


def get_gpt4_extract_ICE():
example_1 = """
1.
Model response: 'Rounded to two decimal places, the perimeter of the sector is approximately:\n\n(-2, 1)'
Extracted Answer: (-2, 1)
""" # noqa

example_2 = """
2.
Model response: 'at those points.\n\nTherefore, the correct option that represents the meaning of the intersection points of the graphs is:\n\nD. They give the solutions to the equation $f(t)=g(t)$.",'
Extracted Answer: D
""" # noqa

example_3 = """
3.
Model response: ' at 1 (there's a closed circle at y = 1), the range in interval notation is \\((-4, 1]\\).\n\nFinal values:\nDomain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)'
Extracted Answer: Domain: \\((-3, 3]\\)\nRange: \\((-4, 1]\\)
""" # noqa

example_4 = """
4.
Model response: 'As it stands, I cannot provide the correct option letter because there isn't enough information to solve for 'y'.'
Extracted Answer: null
""" # noqa

example_5 = """
5.
Model response: 'Given that AB = 17.6 meters, we can now substitute into the equation:\n\nd = 17.6 / cos(38\u00b0)\n\nTherefore, to one decimal place, the distance d between Ned and Bart is approximately 22.3 meters.'
Extracted answer: 22.3
""" # noqa

example_6 = """
6.
Model response: have all the coefficients for the quadratic function:\n\\( f(x) = ax^2 + bx + c \\)\n\\( f(x) = -1x^2 - 2x + 1 \\)\n\nTherefore, the equation for the graphed function \\( f \\) is:\n\\( f(x) = -x^2 - 2x + 1 \\)"'
Extracted answer: f(x) = -x^2 - 2x + 1
""" # noqa

return [example_1, example_2, example_3, example_4, example_5, example_6]


def get_gpt4_score_ICE():
example_1 = """
[Question]: Write the set of numbers represented on the number line in interval notation.
[Standard Answer]: (-2,1]
[Model_answer] : Extracted Answer: \\((-2, 1)\\)
Judgement: 0
""" # noqa

example_2 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : B:2\u221a{{3}}
Judgement: 0
""" # noqa

example_3 = """
[Question]: Find the domain and range of the function f using interval notation.
[Standard Answer]: domain: [-4, 0) and range: (-3, 1]
[Model_answer] : Range: \\((-4, 1]\\)
Judgement: 0
""" # noqa

example_4 = """
[Question]: As shown in the figure, circle O has a radius 1.0, if angle BAC = 60.0, then the length of BC is ()\nChoices:\nA:2\nB:2\u221a{{3}}\nC:\u221a{{3}}\nD:2\u221a{{2}}
[Standard Answer]: C
[Model_answer] : null
Judgement: 0
""" # noqa

return [example_1, example_2, example_3, example_4]


def build_mathverse_gpt4_extract_prompt(line):
task_description = """
I am providing you a response from a model to a math problem, termed 'Model Response'. You should extract the answer from the response as 'Extracted Answer'. Directly output the extracted answer with no explanation.\n\n
""" # noqa
prediction = str(line['prediction'])
demo_prompt = task_description
examples = get_gpt4_extract_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"Model response: '{prediction}'\nExtracted Answer: "
full_prompt = f'{demo_prompt}7.\n{test_prompt}'

return full_prompt


def build_mathverse_gpt4_score_prompt(line):
task_description = """
Below are two answers to a math question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent.
Please note that only when the [Model_answer] completely matches the [Standard Answer] means they are consistent. For non-multiple-choice questions, if the meaning is expressed in the same way, it is also considered consistent, for example, 0.5m and 50cm.
If they are consistent, Judgement is 1; if they are different, Judgement is 0.\n\n
""" # noqa
question_for_eval = line['question_for_eval']
extract = line['extract']
answer = line['answer']
demo_prompt = task_description
examples = get_gpt4_score_ICE()
for example in examples:
demo_prompt += example + '\n\n'
test_prompt = f"""
[Question]: {question_for_eval}
[Standard Answer]: {answer}
[Model_answer] : {extract}
Judgement:"""
full_prompt = f'{demo_prompt}{test_prompt}'

return full_prompt


def post_check_score(line, prefetch=False):
ans = str(line['answer']).strip()
response = str(line['extract']).strip()

if response == ans:
return response if prefetch else True
else:
return False


def MathVerse_auxeval_extract(model, line):
prompt = build_mathverse_gpt4_extract_prompt(line)
log = ''
retry = 5
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)

if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_extract=log, extract=res)
log += 'All 5 retries failed.\n'
return dict(log_extract=log, extract='')


def MathVerse_auxeval_score(model, line):
prompt = build_mathverse_gpt4_score_prompt(line)
log = ''
retry = 5
    # If the extracted answer already matches the standard answer exactly, skip the judge call.
    if post_check_score(line, prefetch=True):
        return dict(log_score='Prefetch succeed', score=True)
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)

if FAIL_MSG in res or res.strip() not in ['0', '1']:
log += f'Try {i}: output is {prediction}, res is {res}, failed to parse.\n'
else:
log += 'Succeed'
return dict(log_score=log, score=int(res) == 1)
log += 'All 5 retries failed.\n'
return dict(log_score=log, score=False)


def get_acc_with_condition(res_pd, key, value):
"""
Calculate the accuracy of predictions with a specific condition
"""
total_pd = res_pd[res_pd[key] == value]
correct_pd = total_pd[total_pd['score']]
acc = '{:.2f}'.format(len(correct_pd) / len(total_pd) * 100) if len(total_pd) > 0 else '0.00'
return len(correct_pd), len(total_pd), acc


def MathVerse_acc(result_file):
df = load(result_file)
total = len(df)
correct = sum(1 for _, row in df.iterrows() if row['score'])
accuracy = round(correct / total * 100, 2)
scores = {'average': {'accuracy': accuracy, 'correct': correct, 'total': total}}

df['metadata'] = df['metadata'].apply(lambda x: x.replace("'", '"'))
df['metadata'] = df['metadata'].apply(json.loads)
df_metadata = pd.json_normalize(df['metadata'])
df = pd.concat([df.drop('metadata', axis=1), df_metadata], axis=1)

target_keys = ['problem_version', 'subfield']

for key in target_keys:
values = df[key].unique()
scores[key] = {}
for value in values:
correct, total, acc = get_acc_with_condition(df, key, value)
if total > 0:
scores[key][value] = {'accuracy': acc, 'correct': correct, 'total': total}
scores[key] = dict(sorted(scores[key].items(), key=lambda item: float(item[1]['accuracy']), reverse=True))

return pd.DataFrame.from_dict(scores, orient='index')
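
For reference, the aggregation helper can also be exercised on its own against an already-scored file; a brief sketch, with the path being a placeholder:

    from vlmeval.dataset.utils.mathverse import MathVerse_acc

    scores = MathVerse_acc('outputs/MyVLM/MyVLM_MathVerse_MINI_chatgpt-0125_score.xlsx')  # placeholder path
    print(scores)  # index: 'average' plus per-category breakdowns for problem_version and subfield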
2 changes: 1 addition & 1 deletion vlmeval/tools.py
@@ -351,7 +351,7 @@ def EVAL(dataset_name, data_file):
dataset = build_dataset(dataset_name)
# Set the judge kwargs first before evaluation or dumping
judge_kwargs = {'nproc': 4, 'verbose': True}
if dataset.TYPE in ['MCQ', 'Y/N']:
if dataset.TYPE in ['MCQ', 'Y/N'] or listinstr(['MathVerse'], dataset_name):
judge_kwargs['model'] = 'chatgpt-0125'
elif listinstr(['MMVet', 'MathVista', 'LLaVABench', 'MMBench-Video', 'MathVision'], dataset_name):
judge_kwargs['model'] = 'gpt-4-turbo'
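Since EVAL(dataset_name, data_file) is defined in vlmeval/tools.py, the new benchmark can presumably also be re-scored from an existing prediction file; a sketch with a placeholder path:

    from vlmeval.tools import EVAL  # signature per the hunk above: EVAL(dataset_name, data_file)

    EVAL('MathVerse_MINI', 'outputs/MyVLM/MyVLM_MathVerse_MINI.xlsx')  # placeholder prediction file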
2 changes: 1 addition & 1 deletion vlmeval/vlm/internvl_chat.py
@@ -247,7 +247,7 @@ def build_prompt(self, line, dataset=None):
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
question = line['question']
if listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet'], dataset):
if listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse'], dataset):
prompt = question
elif listinstr(['LLaVABench'], dataset):
prompt = question + '\nAnswer this question in detail.'
2 changes: 1 addition & 1 deletion vlmeval/vlm/mmalaya.py
@@ -306,7 +306,7 @@ def build_prompt(self, line, dataset=None):
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if listinstr(['MathVista', 'MathVision'], dataset):
if listinstr(['MathVista', 'MathVision', 'MathVerse'], dataset):
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
