Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Add GaoKaoMath Dataset for Evaluation & MATH Model Eval Config #1589

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions configs/datasets/gaokao_math/gaokao_math_gen_9b076f.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaoKaoMATHDataset, GaoKaoMATHEvaluator


# Chinese-language prompt that tells the model to act as a math grading
# expert and to output only the key answer extracted from a given response.
# It describes the expected answer format for four question types
# (single-choice, multiple-choice, fill-in-the-blank, free-response) and asks
# for "[No valid answer]" when the response never settles on a final answer.
# Placeholders filled per sample: {question_type}, {question}, {response}.
# NOTE(review): the multiple-choice example ("A", "B", "C") is not wrapped in
# brackets although the text calls it a list, unlike the fill-in-the-blank
# examples -- confirm this is intended.
# NOTE(review): the doubled braces in \\frac{{1}}{{2}} look like str.format
# escapes; confirm the template engine actually un-escapes them.
MATH_CN_PROMPT="""
你是一个数学阅卷专家,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。
我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。

对于单选题,答案应该是选项字母,例如 "A";
对于多选题,答案应该是一个选项字母的列表,例如 "A" 或 "A", "B", "C";
对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。
对于问答题,类似填空题,为每个小问抽出相应答案,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。

如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。
问题类型: {question_type}
原始问题: {question}
回答: {response}
提取的关键答案:
"""

# Reader config: the three input columns carry the same names as the
# placeholders used in MATH_CN_PROMPT; 'extract_answer' is the output column.
gaokao_math_reader_cfg = dict(
    input_columns=['question', 'response', 'question_type'],
    output_column='extract_answer',
)


# Inference config: one HUMAN turn whose text is MATH_CN_PROMPT, paired with
# ZeroRetriever and a GenInferencer using max_out_len=512.
gaokao_math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=MATH_CN_PROMPT),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Evaluation config.
# NOTE(review): `url` is presumably the API endpoint of the model used by
# GaoKaoMATHEvaluator -- confirm, and point it at your own service.
gaokao_math_eval_cfg = dict(
    evaluator=dict(
        type=GaoKaoMATHEvaluator,
        url='http://0.0.0.0:23333/v1',
    ),
)

# Dataset list exported by this config; it ties together the reader,
# inference and evaluation configs defined above.
gaokao_math_datasets = [
    dict(
        type=GaoKaoMATHDataset,
        abbr='GaoKaoMATH',
        path='./data/gaokao_math/test_2k.json',
        reader_cfg=gaokao_math_reader_cfg,
        infer_cfg=gaokao_math_infer_cfg,
        eval_cfg=gaokao_math_eval_cfg,
    ),
]
78 changes: 78 additions & 0 deletions configs/datasets/math/math_0shot_llm_judge_gen_393424.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator
from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE

# ----------------------------- Eval Parameters -----------------------------
## Postprocess function
# One of: 're', 'xfinder_model', 'naive_model'
post_func = 're'

## Evaluate function
# One of: 're', 'naive_model'
eval_func = 'naive_model'

## Model API URLs
# Used for 'xFinder-qwen1505' when post_func is 'xfinder_model'
xfinder_url = 'http://0.0.0.0:23333/v1'
# Replace with your model name
naive_model_name = 'Qwen/Qwen2.5-72B-Instruct'
# Multiple API endpoints for acceleration.
# NOTE(review): these addresses look deployment-specific -- replace them with
# your own endpoints.
naive_model_url = [
    'http://22.8.6.22:23333/v1',
    'http://22.8.67.84:23333/v1',
    'http://22.8.72.81:23333/v1',
    'http://22.9.42.143:23333/v1',
]

# ----------------------------- Detailed Config -----------------------------

# Reader config: 'problem' is the only input column (it is the placeholder
# used by the inference prompt); 'solution' is the output column.
math_reader_cfg = dict(
    input_columns=['problem'],
    output_column='solution',
)

# Inference config: a single HUMAN turn containing the problem followed by an
# instruction to reason step by step and box the final answer. Uses
# ZeroRetriever and a GenInferencer with max_out_len=1024.
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    # Adjacent literals concatenate into one prompt string.
                    prompt=(
                        '{problem}\n'
                        'Please reason step by step, and put your final '
                        'answer within \\boxed{}.'
                    ),
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

# Build the prediction post-processor config selected by `post_func`.
#   're'            -> math_postprocess_v2 with no extra arguments
#   'xfinder_model' -> xfinder_postprocess calling 'xFinder-qwen1505' at
#                      `xfinder_url`
#   'naive_model'   -> naive_model_postprocess calling `naive_model_name` at
#                      `naive_model_url` with MATH_NAVIE_PROMPT_TEMPLATE
if post_func == 're':
    pred_postprocessor = dict(type=math_postprocess_v2)
elif post_func == 'xfinder_model':
    pred_postprocessor = dict(
        type=xfinder_postprocess,
        question_type='math',
        model_name='xFinder-qwen1505',
        num_processes=128,
        api_url=xfinder_url,
    )
elif post_func == 'naive_model':
    pred_postprocessor = dict(
        type=naive_model_postprocess,
        custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE,
        model_name=naive_model_name,
        num_processes=64,
        api_url=naive_model_url,
    )
else:
    # Without this branch a typo in `post_func` would leave
    # `pred_postprocessor` unbound and fail later with an unrelated NameError.
    raise ValueError(
        f'Unsupported post_func {post_func!r}; expected one of '
        "'re', 'xfinder_model', 'naive_model'."
    )

# Build the evaluator config selected by `eval_func`.
#   're'          -> MATHEvaluator with version='v2'
#   'naive_model' -> GaoKaoMATHEvaluator calling `naive_model_name` at
#                    `naive_model_url`
if eval_func == 're':
    evaluator = dict(type=MATHEvaluator, version='v2')
elif eval_func == 'naive_model':
    evaluator = dict(
        type=GaoKaoMATHEvaluator,
        model_name=naive_model_name,
        url=naive_model_url,
    )
else:
    # Without this branch a typo in `eval_func` would leave `evaluator`
    # unbound and fail later with an unrelated NameError.
    raise ValueError(
        f'Unsupported eval_func {eval_func!r}; expected one of '
        "'re', 'naive_model'."
    )

# Evaluation config: combines the evaluator and the prediction
# post-processor chosen above.
math_eval_cfg = dict(
    evaluator=evaluator,
    pred_postprocessor=pred_postprocessor,
)

# Dataset list exported by this config; it ties together the reader,
# inference and evaluation configs defined above.
math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    ),
]
48 changes: 48 additions & 0 deletions opencompass/configs/datasets/gaokao_math/gaokao_math_gen_9b076f.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaoKaoMATHDataset, GaoKaoMATHEvaluator


# Chinese-language prompt that tells the model to act as a math grading
# expert and to output only the key answer extracted from a given response.
# It describes the expected answer format for four question types
# (single-choice, multiple-choice, fill-in-the-blank, free-response) and asks
# for "[No valid answer]" when the response never settles on a final answer.
# Placeholders filled per sample: {question_type}, {question}, {response}.
# NOTE(review): the multiple-choice example ("A", "B", "C") is not wrapped in
# brackets although the text calls it a list, unlike the fill-in-the-blank
# examples -- confirm this is intended.
# NOTE(review): the doubled braces in \\frac{{1}}{{2}} look like str.format
# escapes; confirm the template engine actually un-escapes them.
MATH_CN_PROMPT="""
你是一个数学阅卷专家,任务是从给定的回答句子中提取精确的关键答案。你必须只提供提取的关键答案,不包括任何额外的文字。
我将为你提供一个问题、回答句子和问题类型。回答句子是对所提供问题的回应。利用提供的信息,你必须准确而精确地确定并从回答句子中提取预期的关键答案。请不要对问题发表主观看法。

对于单选题,答案应该是选项字母,例如 "A";
对于多选题,答案应该是一个选项字母的列表,例如 "A" 或 "A", "B", "C";
对于填空题,答案应该是一个填入空白处的答案列表,列表的数量应该与问题中的空白数量相同,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。
对于问答题,类似填空题,为每个小问抽出相应答案,例如 ["$$\\frac{{1}}{{2}}$$"] 或 ["$$\\frac{{1}}{{2}}$$", "2"]。

如果回答句子提供了多个不同的答案,请仔细判断后面提供的答案是否是对前面答案的修正或修改。如果是这样,提取这个修正或修改后的答案作为最终答案。相反,如果回答句子在多个答案之间波动而没有明确的最终答案,你应该输出 [No valid answer]。
问题类型: {question_type}
原始问题: {question}
回答: {response}
提取的关键答案:
"""

# Reader config: the three input columns carry the same names as the
# placeholders used in MATH_CN_PROMPT; 'extract_answer' is the output column.
gaokao_math_reader_cfg = dict(
    input_columns=['question', 'response', 'question_type'],
    output_column='extract_answer',
)


# Inference config: one HUMAN turn whose text is MATH_CN_PROMPT, paired with
# ZeroRetriever and a GenInferencer using max_out_len=512.
gaokao_math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=MATH_CN_PROMPT),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Evaluation config.
# NOTE(review): `url` is presumably the API endpoint of the model used by
# GaoKaoMATHEvaluator -- confirm, and point it at your own service.
gaokao_math_eval_cfg = dict(
    evaluator=dict(
        type=GaoKaoMATHEvaluator,
        url='http://0.0.0.0:23333/v1',
    ),
)

# Dataset list exported by this config; it ties together the reader,
# inference and evaluation configs defined above.
gaokao_math_datasets = [
    dict(
        type=GaoKaoMATHDataset,
        abbr='GaoKaoMATH',
        path='./data/gaokao_math/test_2k.json',
        reader_cfg=gaokao_math_reader_cfg,
        infer_cfg=gaokao_math_infer_cfg,
        eval_cfg=gaokao_math_eval_cfg,
    ),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator
from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE

# ----------------------------- Eval Parameters -----------------------------
## Postprocess function
# One of: 're', 'xfinder_model', 'naive_model'
post_func = 're'

## Evaluate function
# One of: 're', 'naive_model'
eval_func = 'naive_model'

## Model API URLs
# Used for 'xFinder-qwen1505' when post_func is 'xfinder_model'
xfinder_url = 'http://0.0.0.0:23333/v1'
# Replace with your model name
naive_model_name = 'Qwen/Qwen2.5-72B-Instruct'
# Multiple API endpoints for acceleration.
# NOTE(review): these addresses look deployment-specific -- replace them with
# your own endpoints.
naive_model_url = [
    'http://22.8.6.22:23333/v1',
    'http://22.8.67.84:23333/v1',
    'http://22.8.72.81:23333/v1',
    'http://22.9.42.143:23333/v1',
]

# ----------------------------- Detailed Config -----------------------------

# Reader config: 'problem' is the only input column (it is the placeholder
# used by the inference prompt); 'solution' is the output column.
math_reader_cfg = dict(
    input_columns=['problem'],
    output_column='solution',
)

# Inference config: a single HUMAN turn containing the problem followed by an
# instruction to reason step by step and box the final answer. Uses
# ZeroRetriever and a GenInferencer with max_out_len=1024.
math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    # Adjacent literals concatenate into one prompt string.
                    prompt=(
                        '{problem}\n'
                        'Please reason step by step, and put your final '
                        'answer within \\boxed{}.'
                    ),
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=1024),
)

# Build the prediction post-processor config selected by `post_func`.
#   're'            -> math_postprocess_v2 with no extra arguments
#   'xfinder_model' -> xfinder_postprocess calling 'xFinder-qwen1505' at
#                      `xfinder_url`
#   'naive_model'   -> naive_model_postprocess calling `naive_model_name` at
#                      `naive_model_url` with MATH_NAVIE_PROMPT_TEMPLATE
if post_func == 're':
    pred_postprocessor = dict(type=math_postprocess_v2)
elif post_func == 'xfinder_model':
    pred_postprocessor = dict(
        type=xfinder_postprocess,
        question_type='math',
        model_name='xFinder-qwen1505',
        num_processes=128,
        api_url=xfinder_url,
    )
elif post_func == 'naive_model':
    pred_postprocessor = dict(
        type=naive_model_postprocess,
        custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE,
        model_name=naive_model_name,
        num_processes=64,
        api_url=naive_model_url,
    )
else:
    # Without this branch a typo in `post_func` would leave
    # `pred_postprocessor` unbound and fail later with an unrelated NameError.
    raise ValueError(
        f'Unsupported post_func {post_func!r}; expected one of '
        "'re', 'xfinder_model', 'naive_model'."
    )

# Build the evaluator config selected by `eval_func`.
#   're'          -> MATHEvaluator with version='v2'
#   'naive_model' -> GaoKaoMATHEvaluator calling `naive_model_name` at
#                    `naive_model_url`
if eval_func == 're':
    evaluator = dict(type=MATHEvaluator, version='v2')
elif eval_func == 'naive_model':
    evaluator = dict(
        type=GaoKaoMATHEvaluator,
        model_name=naive_model_name,
        url=naive_model_url,
    )
else:
    # Without this branch a typo in `eval_func` would leave `evaluator`
    # unbound and fail later with an unrelated NameError.
    raise ValueError(
        f'Unsupported eval_func {eval_func!r}; expected one of '
        "'re', 'naive_model'."
    )

# Evaluation config: combines the evaluator and the prediction
# post-processor chosen above.
math_eval_cfg = dict(
    evaluator=evaluator,
    pred_postprocessor=pred_postprocessor,
)

# Dataset list exported by this config; it ties together the reader,
# inference and evaluation configs defined above.
math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='opencompass/math',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    ),
]
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from .flames import * # noqa: F401, F403
from .flores import * # noqa: F401, F403
from .game24 import * # noqa: F401, F403
from .gaokao_math import * # noqa: F401, F403
from .GaokaoBench import * # noqa: F401, F403
from .govrepcrs import * # noqa: F401, F403
from .gpqa import * # noqa: F401, F403
Expand Down
Loading
Loading