From 12b84aeb3bf5bd1f89a0b1287db5ee435b4c3511 Mon Sep 17 00:00:00 2001 From: jxd <44057635+jxd0712@users.noreply.github.com> Date: Fri, 26 Jul 2024 18:42:30 +0800 Subject: [PATCH] [Feature] Update CHARM Memorization (#1230) * update gemini api and add gemini models * add openai models * update CHARM evaluation * add CHARM memorization tasks * add CharmMemSummarizer (output eval details for memorization-independent reasoning analysis) * update CHARM readme --------- Co-authored-by: wujiang --- configs/datasets/CHARM/README.md | 62 +++++- configs/datasets/CHARM/README_ZH.md | 62 +++++- .../datasets/CHARM/charm_memory_gen_bbbd53.py | 63 ++++++ .../datasets/CHARM/charm_memory_settings.py | 31 +++ configs/eval_charm_mem.py | 94 ++++++++ configs/{eval_charm.py => eval_charm_rea.py} | 58 +++-- configs/models/gemini/gemini_1_5_flash.py | 22 ++ configs/models/gemini/gemini_1_5_pro.py | 22 ++ configs/models/gemini/gemini_pro.py | 3 +- configs/models/openai/gpt_3_5_turbo_0125.py | 20 ++ configs/models/openai/gpt_4o_2024_05_13.py | 20 ++ opencompass/datasets/charm.py | 100 ++++++++- opencompass/models/gemini_api.py | 31 ++- .../summarizers/subjective/__init__.py | 1 + opencompass/summarizers/subjective/charm.py | 208 ++++++++++++++++++ 15 files changed, 759 insertions(+), 38 deletions(-) create mode 100644 configs/datasets/CHARM/charm_memory_gen_bbbd53.py create mode 100644 configs/datasets/CHARM/charm_memory_settings.py create mode 100644 configs/eval_charm_mem.py rename configs/{eval_charm.py => eval_charm_rea.py} (62%) create mode 100644 configs/models/gemini/gemini_1_5_flash.py create mode 100644 configs/models/gemini/gemini_1_5_pro.py create mode 100644 configs/models/openai/gpt_3_5_turbo_0125.py create mode 100644 configs/models/openai/gpt_4o_2024_05_13.py create mode 100644 opencompass/summarizers/subjective/charm.py diff --git a/configs/datasets/CHARM/README.md b/configs/datasets/CHARM/README.md index b297f8528..a89b24758 100644 --- a/configs/datasets/CHARM/README.md +++ b/configs/datasets/CHARM/README.md @@ -86,15 +86,69 @@ Below are the steps for quickly downloading CHARM and using OpenCompass for eval ### 1. Download CHARM ```bash git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo} + +cd ${path_to_opencompass} +mkdir -p data +ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM ``` ### 2. 
Run Inference and Evaluation ```bash cd ${path_to_opencompass} -mkdir -p data -ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM -# Infering and evaluating CHARM with hf_llama3_8b_instruct model -python run.py --models hf_llama3_8b_instruct --datasets charm_gen +# modify config file `configs/eval_charm_rea.py`: uncomment or add the models you want to evaluate +python run.py configs/eval_charm_rea.py -r --dump-eval-details + +# modify config file `configs/eval_charm_mem.py`: uncomment or add the models you want to evaluate +python run.py configs/eval_charm_mem.py -r --dump-eval-details +``` +The inference and evaluation results will be in `${path_to_opencompass}/outputs`, like this: +```bash +outputs +├── CHARM_mem +│ └── chat +│ └── 20240605_151442 +│ ├── predictions +│ │ ├── internlm2-chat-1.8b-turbomind +│ │ ├── llama-3-8b-instruct-lmdeploy +│ │ └── qwen1.5-1.8b-chat-hf +│ ├── results +│ │ ├── internlm2-chat-1.8b-turbomind_judged-by--GPT-3.5-turbo-0125 +│ │ ├── llama-3-8b-instruct-lmdeploy_judged-by--GPT-3.5-turbo-0125 +│ │ └── qwen1.5-1.8b-chat-hf_judged-by--GPT-3.5-turbo-0125 +│   └── summary +│   └── 20240605_205020 # MEMORY_SUMMARY_DIR +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Anachronisms_Judgment +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Movie_and_Music_Recommendation +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Sport_Understanding +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Time_Understanding +│   └── judged-by--GPT-3.5-turbo-0125.csv # MEMORY_SUMMARY_CSV +└── CHARM_rea + └── chat + └── 20240605_152359 + ├── predictions + │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + ├── results # REASON_RESULTS_DIR + │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + └── summary + ├── summary_20240605_205328.csv # REASON_SUMMARY_CSV + └── summary_20240605_205328.txt +``` +### 3. Generate Analysis Results +```bash +cd ${path_to_CHARM_repo} + +# generate Table5, Table6, Table9 and Table10 in https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/summarize_reasoning.py ${REASON_SUMMARY_CSV} + +# generate Figure3 and Figure9 in https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/summarize_mem_rea.py ${REASON_SUMMARY_CSV} ${MEMORY_SUMMARY_CSV} + +# generate Table7, Table12, Table13 and Figure11 in https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/analyze_mem_indep_rea.py data/CHARM ${REASON_RESULTS_DIR} ${MEMORY_SUMMARY_DIR} ${MEMORY_SUMMARY_CSV} ``` ## 🖊️ Citation diff --git a/configs/datasets/CHARM/README_ZH.md b/configs/datasets/CHARM/README_ZH.md index 414c65fe3..c5381030e 100644 --- a/configs/datasets/CHARM/README_ZH.md +++ b/configs/datasets/CHARM/README_ZH.md @@ -84,15 +84,69 @@ ### 1. 下载 CHARM ```bash git clone https://github.com/opendatalab/CHARM ${path_to_CHARM_repo} + +cd ${path_to_opencompass} +mkdir -p data +ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM ``` ### 2. 
推理和评测 ```bash cd ${path_to_opencompass} -mkdir -p data -ln -snf ${path_to_CHARM_repo}/data/CHARM ./data/CHARM -# 在CHARM上对模型hf_llama3_8b_instruct做推理和评测 -python run.py --models hf_llama3_8b_instruct --datasets charm_gen +# 修改配置文件`configs/eval_charm_rea.py`: 将现有的模型取消注释,或者添加你想评测的模型 +python run.py configs/eval_charm_rea.py -r --dump-eval-details + +# 修改配置文件`configs/eval_charm_mem.py`: 将现有的模型取消注释,或者添加你想评测的模型 +python run.py configs/eval_charm_mem.py -r --dump-eval-details +``` +推理和评测的结果位于路径`${path_to_opencompass}/outputs`, 如下所示: +```bash +outputs +├── CHARM_mem +│ └── chat +│ └── 20240605_151442 +│ ├── predictions +│ │ ├── internlm2-chat-1.8b-turbomind +│ │ ├── llama-3-8b-instruct-lmdeploy +│ │ └── qwen1.5-1.8b-chat-hf +│ ├── results +│ │ ├── internlm2-chat-1.8b-turbomind_judged-by--GPT-3.5-turbo-0125 +│ │ ├── llama-3-8b-instruct-lmdeploy_judged-by--GPT-3.5-turbo-0125 +│ │ └── qwen1.5-1.8b-chat-hf_judged-by--GPT-3.5-turbo-0125 +│   └── summary +│   └── 20240605_205020 # MEMORY_SUMMARY_DIR +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Anachronisms_Judgment +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Movie_and_Music_Recommendation +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Sport_Understanding +│   ├── judged-by--GPT-3.5-turbo-0125-charm-memory-Chinese_Time_Understanding +│   └── judged-by--GPT-3.5-turbo-0125.csv # MEMORY_SUMMARY_CSV +└── CHARM_rea + └── chat + └── 20240605_152359 + ├── predictions + │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + ├── results # REASON_RESULTS_DIR + │ ├── internlm2-chat-1.8b-turbomind + │ ├── llama-3-8b-instruct-lmdeploy + │ └── qwen1.5-1.8b-chat-hf + └── summary + ├── summary_20240605_205328.csv # REASON_SUMMARY_CSV + └── summary_20240605_205328.txt +``` +### 3. 生成分析结果 +```bash +cd ${path_to_CHARM_repo} + +# 生成论文中的Table5, Table6, Table9 and Table10,详见https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/summarize_reasoning.py ${REASON_SUMMARY_CSV} + +# 生成论文中的Figure3 and Figure9,详见https://arxiv.org/abs/2403.14112 +PYTHONPATH=. python tools/summarize_mem_rea.py ${REASON_SUMMARY_CSV} ${MEMORY_SUMMARY_CSV} + +# 生成论文中的Table7, Table12, Table13 and Figure11,详见https://arxiv.org/abs/2403.14112 +PYTHONPATH=. 
python tools/analyze_mem_indep_rea.py data/CHARM ${REASON_RESULTS_DIR} ${MEMORY_SUMMARY_DIR} ${MEMORY_SUMMARY_CSV} ``` ## 🖊️ 引用 diff --git a/configs/datasets/CHARM/charm_memory_gen_bbbd53.py b/configs/datasets/CHARM/charm_memory_gen_bbbd53.py new file mode 100644 index 000000000..2617dceef --- /dev/null +++ b/configs/datasets/CHARM/charm_memory_gen_bbbd53.py @@ -0,0 +1,63 @@ +import os +from mmengine.config import read_base + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CharmDataset, CharmMemoryEvaluator, LMEvaluator + +with read_base(): + from .charm_memory_settings import charm_memory_tasks, judge_system_prompts, dataset_path + +charm_memory_datasets = [] + +for _task in charm_memory_tasks: + + charm_memory_reader_cfg = dict(input_columns=['input'], + output_column='target') + + charm_memory_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='请尽可能简短地回答下述问题。\n问题:{input}\n答:') + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), + ) + + if _task == 'Chinese_Movie_and_Music_Recommendation': + charm_memory_eval_cfg = dict( + evaluator=dict(type=CharmMemoryEvaluator), + pred_role='BOT', + ) + else: + judge_system_prompt = judge_system_prompts[_task] + charm_memory_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=judge_system_prompt + + "\n\n[Question]\n{input}\n[The Start of Reference Answer]\n{target}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{prediction}\n[The End of Assistant's Answer]" # noqa + ), + ]), + ), + ), + pred_role='BOT', + ) + + charm_memory_datasets.append( + dict( + type=CharmDataset, + path=dataset_path, + name=_task, + abbr='charm-memory-' + _task, + reader_cfg=charm_memory_reader_cfg, + infer_cfg=charm_memory_infer_cfg.copy(), + eval_cfg=charm_memory_eval_cfg.copy(), + )) diff --git a/configs/datasets/CHARM/charm_memory_settings.py b/configs/datasets/CHARM/charm_memory_settings.py new file mode 100644 index 000000000..12f722d75 --- /dev/null +++ b/configs/datasets/CHARM/charm_memory_settings.py @@ -0,0 +1,31 @@ +import os + +charm_memory_tasks = [ + 'Chinese_Anachronisms_Judgment', + 'Chinese_Movie_and_Music_Recommendation', + 'Chinese_Sport_Understanding', + 'Chinese_Time_Understanding', +] + +dataset_path = 'data/CHARM/memorization' + +system_prompt_template = """Please act as an impartial judge, comparing the responses of the AI assistants to the reference answer and determining if the answers are correct. +You will receive the reference answer provided by a human and the responses of the AI assistants. +Your task is to judge whether the AI assistant's answer is correct. +{task_specific_prompt} +After providing your explanation, strictly output your final judgment in the following format: “[正确]” if the AI assistant's response is correct, “[错误]” if the AI assistant's response is incorrect. 
+""" + +task_specific_prompts = { + 'Chinese_Anachronisms_Judgment': + "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list.", + 'Chinese_Time_Understanding': + "When evaluating the AI assistant's response regarding Chinese solar terms, as long as the AI assistant's response falls within the time frame provided in the reference answer, consider it correct.", + 'Chinese_Sport_Understanding': + "If the provided reference answer is a list, the model's prediction is considered correct if it matches any item in the list." +} + +judge_system_prompts = { + k: system_prompt_template.format(task_specific_prompt=v) + for k, v in task_specific_prompts.items() +} diff --git a/configs/eval_charm_mem.py b/configs/eval_charm_mem.py new file mode 100644 index 000000000..9703a336a --- /dev/null +++ b/configs/eval_charm_mem.py @@ -0,0 +1,94 @@ +from mmengine.config import read_base + +from opencompass.models import OpenAI +from opencompass.runners import LocalRunner +from opencompass.partitioners.sub_size import SubjectiveSizePartitioner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask +from opencompass.summarizers import CharmMemSummarizer + +with read_base(): + from .datasets.CHARM.charm_memory_gen_bbbd53 import charm_memory_datasets as datasets + + # ------>>>>>> https://arxiv.org/abs/2403.14112 + # from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model + # from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model + # from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model + # from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model + # from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model + # from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model + # from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model + # from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model + # from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1 + # from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1 + # from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model + # from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model + # from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model + # from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model + # from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model + # from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model + # from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model + # from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model + # from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model + # <<<<<<------ https://arxiv.org/abs/2403.14112 + + # from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model + # from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model + # from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model + # from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model + + # from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model + # from .models.hf_llama.lmdeploy_llama3_70b_instruct import 
models as lmdeploy_llama3_70b_instruct_model + + # from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model + + # from .models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model + # from .models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model + + # from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model + + # from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model + # from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model + # from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model + # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +## ------------- JudgeLLM Configuration +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +]) +judge_models = [ + dict( + abbr='GPT-3.5-turbo-0125', + type=OpenAI, + path='gpt-3.5-turbo-0125', + key='ENV', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=2048, + max_seq_len=2048, + batch_size=8, + temperature=0, + ) +] + +## ------------- Evaluation Configuration +eval = dict( + partitioner=dict( + type=SubjectiveSizePartitioner, + max_task_size=1000, + mode='singlescore', + models=models, + judge_models=judge_models, + ), + runner=dict(type=LocalRunner, + max_num_workers=2, + task=dict(type=SubjectiveEvalTask)), +) + +summarizer = dict(type=CharmMemSummarizer) + +work_dir = './outputs/CHARM_mem/chat/' diff --git a/configs/eval_charm.py b/configs/eval_charm_rea.py similarity index 62% rename from configs/eval_charm.py rename to configs/eval_charm_rea.py index c1b65ccc0..68e132c8c 100644 --- a/configs/eval_charm.py +++ b/configs/eval_charm_rea.py @@ -2,35 +2,55 @@ with read_base(): from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets - from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_7b_chat_model - # from models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model - # from models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model - - # from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model - # from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model - # from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model - # from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model - # from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1 + # ------>>>>>> https://arxiv.org/abs/2403.14112 + # from .models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model + # from .models.openai.gpt_4_1106_preview import models as gpt_4_1106_preview_model # from .models.hf_llama.hf_llama2_7b_chat import models as llama2_7b_chat_model - # from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model - - # from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1 # from .models.hf_llama.hf_llama2_13b_chat import models as llama2_13b_chat_model + # from .models.hf_llama.hf_llama2_70b_chat import models as 
llama2_70b_chat_model + # from .models.vicuna.hf_vicuna_7b_v15_16k import models as vicuna_7b_v15_16k_model # from .models.vicuna.hf_vicuna_13b_v15_16k import models as vicuna_13b_v15_16k_model + # from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model + # from .models.baichuan.hf_baichuan2_7b_chat import models as baichuan2_7b_chat_model # need torch 2.1 + # from .models.baichuan.hf_baichuan2_13b_chat import models as baichuan2_13b_chat_model # need torch 2.1 + # from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model # from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model - + # from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model # from .models.yi.hf_yi_34b_chat import models as yi_34b_chat_model - + # from .models.deepseek.hf_deepseek_7b_chat import models as deepseek_7b_chat_model # from .models.deepseek.hf_deepseek_67b_chat import models as deepseek_67b_chat_model - # from .models.hf_llama.hf_llama2_70b_chat import models as llama2_70b_chat_model + # from .models.qwen.hf_qwen_7b_chat import models as qwen_7b_chat_model + # from .models.qwen.hf_qwen_14b_chat import models as qwen_14b_chat_model + # from .models.qwen.hf_qwen_72b_chat import models as qwen_72b_chat_model + # <<<<<<------ https://arxiv.org/abs/2403.14112 + + # from .models.openai.gpt_3_5_turbo_0125 import models as gpt_3_5_turbo_0125_model + # from .models.openai.gpt_4o_2024_05_13 import models as gpt_4o_2024_05_13_model + # from .models.gemini.gemini_1_5_flash import models as gemini_1_5_flash_model + # from .models.gemini.gemini_1_5_pro import models as gemini_1_5_pro_model + + # from .models.hf_llama.lmdeploy_llama3_8b_instruct import models as lmdeploy_llama3_8b_instruct_model + # from .models.hf_llama.lmdeploy_llama3_70b_instruct import models as lmdeploy_llama3_70b_instruct_model + + # from .models.hf_internlm.lmdeploy_internlm2_chat_1_8b import models as lmdeploy_internlm2_chat_1_8b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_internlm2_chat_7b_model + # from .models.hf_internlm.lmdeploy_internlm2_chat_20b import models as lmdeploy_internlm2_chat_20b_model + + # from .models.yi.hf_yi_1_5_6b_chat import models as yi_1_5_6b_chat_model + # from .models.yi.hf_yi_1_5_34b_chat import models as yi_1_5_34b_chat_model + + # from .models.deepseek.hf_deepseek_v2_chat import models as deepseek_v2_chat_model + + # from .models.qwen.hf_qwen1_5_1_8b_chat import models as qwen1_5_1_8b_chat_model + # from .models.qwen.hf_qwen1_5_7b_chat import models as qwen1_5_7b_chat_model + # from .models.qwen.hf_qwen1_5_14b_chat import models as qwen1_5_14b_chat_model + # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen1_5_72b_chat_model - # from .models.hf_llama.hf_llama3_8b_instruct import models as llama3_8b_instruct_model - # from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_70b_instruct_model - from .summarizers.charm_rea import summarizer + from .summarizers.charm_reason import summarizer models = sum([v for k, v in locals().items() if k.endswith('_model')], []) -work_dir = './outputs/CHARM/chat/' +work_dir = './outputs/CHARM_rea/chat/' # dataset version metric mode internlm2-chat-7b-turbomind # ------------------------------------------------------------- --------- ------------- ------ ----------------------------- diff --git a/configs/models/gemini/gemini_1_5_flash.py b/configs/models/gemini/gemini_1_5_flash.py new file mode 100644 index 000000000..79eea6a7d 
--- /dev/null +++ b/configs/models/gemini/gemini_1_5_flash.py @@ -0,0 +1,22 @@ +from opencompass.models import Gemini + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='gemini-1.5-flash', + type=Gemini, + path='gemini-1.5-flash', + key= + 'ENV', # The key will be obtained from $GEMINI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=15, + max_out_len=100, + max_seq_len=2048, + batch_size=1, + temperature=1, + ) +] diff --git a/configs/models/gemini/gemini_1_5_pro.py b/configs/models/gemini/gemini_1_5_pro.py new file mode 100644 index 000000000..1734849b4 --- /dev/null +++ b/configs/models/gemini/gemini_1_5_pro.py @@ -0,0 +1,22 @@ +from opencompass.models import Gemini + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='gemini-1.5-pro', + type=Gemini, + path='gemini-1.5-pro', + key= + 'ENV', # The key will be obtained from $GEMINI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=2, + max_out_len=100, + max_seq_len=2048, + batch_size=1, + temperature=1, + ) +] diff --git a/configs/models/gemini/gemini_pro.py b/configs/models/gemini/gemini_pro.py index 871e02b67..ff2124bce 100644 --- a/configs/models/gemini/gemini_pro.py +++ b/configs/models/gemini/gemini_pro.py @@ -12,8 +12,7 @@ dict(abbr='gemini', type=Gemini, path='gemini-pro', - key='your keys', # The key will be obtained from Environment, but you can write down your key here as well - url = 'your url', + key='ENV', # The key will be obtained from $GEMINI_API_KEY, but you can write down your key here as well meta_template=api_meta_template, query_per_second=16, max_out_len=100, diff --git a/configs/models/openai/gpt_3_5_turbo_0125.py b/configs/models/openai/gpt_3_5_turbo_0125.py new file mode 100644 index 000000000..efab611b4 --- /dev/null +++ b/configs/models/openai/gpt_3_5_turbo_0125.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAI + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='GPT-3.5-turbo-0125', + type=OpenAI, + path='gpt-3.5-turbo-0125', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8), +] diff --git a/configs/models/openai/gpt_4o_2024_05_13.py b/configs/models/openai/gpt_4o_2024_05_13.py new file mode 100644 index 000000000..e742f8a41 --- /dev/null +++ b/configs/models/openai/gpt_4o_2024_05_13.py @@ -0,0 +1,20 @@ +from opencompass.models import OpenAI + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='GPT-4o-2024-05-13', + type=OpenAI, + path='gpt-4o-2024-05-13', + key= + 'ENV', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8), +] diff --git a/opencompass/datasets/charm.py b/opencompass/datasets/charm.py index 652b0842d..62e4ff044 100644 --- a/opencompass/datasets/charm.py +++ b/opencompass/datasets/charm.py @@ -1,12 +1,14 @@ import json 
import os.path as osp import re +from typing import List, Union from datasets import Dataset -from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.openicl.icl_evaluator import BaseEvaluator, LMEvaluator from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS) +from opencompass.utils import build_dataset_from_cfg from .base import BaseDataset @@ -44,6 +46,102 @@ def score(self, predictions, references): return {'score': score, 'details': details} +UNCERTAIN_LIST = ['不确定', '无法确定', '无法回答', '不知道', '不认识'] + + +def charm_memory_eval(pred: str, ref: Union[str, List[str]]) -> str: + + for uncertain in UNCERTAIN_LIST: + if uncertain in pred: + return '[错误]' + + is_negative = False + if isinstance(ref, str): + if ref.startswith('[not]'): + # For some CHARM memorization questions, ref is "[not]xxx", + # i.e. xxx is a negative example that must NOT appear in pred. + # e.g. https://github.com/opendatalab/CHARM/blob/v1.0/data/CHARM/memorization/Chinese_Movie_and_Music_Recommendation.json#L45 + is_negative = True + + ref = ref[5:] # strip the '[not]' prefix, keep xxx + references = [ref] + else: + references = ref # for some CHARM memorization questions, ref is a List[str] + assert isinstance(references, list) + + for r in references: + if r in pred: # pred contains this reference + if is_negative: + return '[错误]' + else: + return '[正确]' + + if is_negative: # pred contains no negative reference, so it is correct + return '[正确]' + else: + return '[错误]' + + +class CharmMemoryEvaluator(LMEvaluator): + """A rule-based evaluator that judges whether answers to CHARM memorization questions are correct, + used only for the Chinese_Movie_and_Music_Recommendation task. + The other CHARM memorization tasks need an LLM as judge (via LMEvaluator), so the whole eval runs as a SubjectiveEvalTask; + this evaluator therefore takes the same inputs and produces the same outputs as LMEvaluator.""" + + def __init__(self, prompt_template=None, *nargs, **kwargs): + + if prompt_template is None: + prompt_template = dict( + type='PromptTemplate', + template=dict( + round=[dict(role='HUMAN', prompt='')])) # placeholder; unused by rule-based scoring + + super().__init__(prompt_template, *nargs, **kwargs) + + def score(self, predictions, references, **kwargs): + + assert isinstance(predictions, dict) # single-model scoring + references = [{} for _ in range(len(predictions['model_preds'])) + ] if references is None else references + predictions = predictions['model_preds'] + + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'lengths' + } + + eval_results = [ + charm_memory_eval(pred, ref) + for pred, ref in zip(predictions, references) + ] + + dataset = None + if self.dataset_cfg: + dataset = build_dataset_from_cfg(self.dataset_cfg) + + output = dict() + for i in range(len(predictions)): + if dataset is not None: + question = '' + for col in dataset.reader.input_columns: + question += dataset.reader['test'][col][i] + '\n' + output[str(i)] = { + 'origin_prompt': [{ + 'role': + 'HUMAN', + 'prompt': + f"[Question]: {question}[Assistant's Answer]: {predictions[i]}" # noqa + }], + 'prediction': + eval_results[i], + 'gold': + references[i], + } + + return output + + @LOAD_DATASET.register_module() class CharmDataset(BaseDataset):
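A quick sanity check of the `charm_memory_eval` rules above (the example strings are made up, not taken from the dataset; it assumes this patch is installed so the function is importable from `opencompass.datasets.charm`):
```python
from opencompass.datasets.charm import charm_memory_eval

# Any "uncertain" phrase in the prediction is judged wrong.
assert charm_memory_eval('我不确定', '霸王别姬') == '[错误]'
# A plain string reference is correct if it appears in the prediction.
assert charm_memory_eval('答案是《霸王别姬》', '霸王别姬') == '[正确]'
# A '[not]' reference is a negative example: the prediction must NOT contain it.
assert charm_memory_eval('我推荐《英雄》', '[not]英雄') == '[错误]'
assert charm_memory_eval('我推荐《卧虎藏龙》', '[not]英雄') == '[正确]'
# A list reference is correct if any item appears.
assert charm_memory_eval('可能是刘翔或者姚明', ['刘翔', '姚明']) == '[正确]'
```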
diff --git a/opencompass/models/gemini_api.py b/opencompass/models/gemini_api.py index 0e9b089fb..7695b218f 100644 --- a/opencompass/models/gemini_api.py +++ b/opencompass/models/gemini_api.py @@ -1,5 +1,6 @@ # flake8: noqa: E501 import json +import os import time from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union @@ -48,7 +49,18 @@ def __init__( query_per_second=query_per_second, meta_template=meta_template, retry=retry) - self.url = f'https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent?key={key}' + assert isinstance(key, str) + if key == 'ENV': + if 'GEMINI_API_KEY' not in os.environ: + raise ValueError('GEMINI API key is not set.') + key = os.getenv('GEMINI_API_KEY') + + assert path in [ + 'gemini-1.0-pro', 'gemini-pro', 'gemini-1.5-flash', + 'gemini-1.5-pro' + ] # https://ai.google.dev/gemini-api/docs/models/gemini#model-variations + + self.url = f'https://generativelanguage.googleapis.com/v1beta/models/{path}:generateContent?key={key}' self.temperature = temperature self.top_p = top_p self.top_k = top_k @@ -171,17 +183,20 @@ def _generate( str(raw_response.content)) time.sleep(1) continue - if raw_response.status_code == 200 and response['msg'] == 'ok': - body = response['body'] - if 'candidates' not in body: + if raw_response.status_code == 200: + if 'candidates' not in response: self.logger.error(response) else: - if 'content' not in body['candidates'][0]: + if 'content' not in response['candidates'][0]: return "Due to Google's restrictive policies, I am unable to respond to this question." else: - return body['candidates'][0]['content']['parts'][0][ - 'text'].strip() - self.logger.error(response['msg']) + return response['candidates'][0]['content']['parts'][ 0]['text'].strip() + try: + msg = response['error']['message'] + self.logger.error(msg) + except KeyError: + pass self.logger.error(response) time.sleep(1) diff --git a/opencompass/summarizers/subjective/__init__.py b/opencompass/summarizers/subjective/__init__.py index 67e19e646..f6093566f 100644 --- a/opencompass/summarizers/subjective/__init__.py +++ b/opencompass/summarizers/subjective/__init__.py @@ -3,6 +3,7 @@ from .all_obj import AllObjSummarizer from .alpacaeval import AlpacaSummarizer from .arenahard import ArenaHardSummarizer +from .charm import CharmMemSummarizer from .compass_arena import CompassArenaSummarizer from .compassbench import CompassBenchSummarizer from .corev2 import Corev2Summarizer diff --git a/opencompass/summarizers/subjective/charm.py b/opencompass/summarizers/subjective/charm.py new file mode 100644 index 000000000..c9c3fed6d --- /dev/null +++ b/opencompass/summarizers/subjective/charm.py @@ -0,0 +1,208 @@ +# flake8: noqa: E501 +import csv +import json +import os +import os.path as osp +import re +from collections import defaultdict +from datetime import datetime + +import mmengine +import numpy as np +import pandas as pd +from mmengine import ConfigDict +from prettytable import from_csv + +from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg, + model_abbr_from_cfg) + +from .utils import get_outdir + + +def post_process_charm_mem(judgement: str): + """Input a string like below: + + xxx[correct]xxx, and extract the judgement. + """ + pattern = r'(?i)\[(incorrect|correct|正确|错误|Yes|No)\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + content = matched_result[0].lower() + if content in ['correct', '正确', 'yes']: + return {'correct': True} + elif content in ['incorrect', '错误', 'no']: + return {'correct': False} + else: + return None
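+
+# Illustration (hypothetical judge outputs; the pattern above is case-insensitive):
+#   post_process_charm_mem('经过比对,该回答与参考答案一致。[正确]') -> {'correct': True}
+#   post_process_charm_mem('The answer is wrong. [Incorrect]') -> {'correct': False}
+#   post_process_charm_mem('no bracketed verdict here') -> None (skipped by the loader below)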
+ """ + dataset_abbr = dataset_abbr_from_cfg(dataset) + filename = osp.join(subdir_path, dataset_abbr + '.json') + partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json') + if osp.exists(osp.realpath(filename)): + result = mmengine.load(filename) + elif osp.exists(osp.realpath(partial_filename)): + filename = partial_filename + result = {} + i = 1 + partial_dict_flag = 0 + while osp.exists(osp.realpath(filename)): + res = mmengine.load(filename) + for k, v in res.items(): + result[partial_dict_flag] = v + partial_dict_flag += 1 + filename = osp.join(subdir_path, + dataset_abbr + '_' + str(i) + '.json') + i += 1 + else: + result = {} + + if len(result) == 0: + print('*' * 100) + print('There are no results for ' + filename + ' or ' + + partial_filename) + print('*' * 100) + assert len(result) > 0 + + judging_prompts = [] + judged_answers = [] + references = [] + for k, v in result.items(): + processed_judge = post_process(v['prediction']) + if processed_judge is not None: + judged_answers.append(processed_judge) + references.append(v['gold']) + judging_origin_prompts = v['origin_prompt'] + if len(judging_origin_prompts) > 0: + judging_prompts.append(judging_origin_prompts[0].get( + 'prompt', None)) + if len(judged_answers) != len(result): + print( + f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!' + ) + if len(judged_answers) == 0: + print('*' * 100) + print( + 'There are no extracted judgements, please change your judge model or check your prompt!!!' + ) + print('*' * 100) + assert len(judged_answers) > 0 + return judged_answers, references, judging_prompts + + +def get_accuracy(judged_answers): + n_total = 0 + n_correct = 0 + for ans in judged_answers: + if ans.get('correct', False): + n_correct += 1 + n_total += 1 + + return round(n_correct / n_total * 100, 2) + + +class CharmMemSummarizer: + """Do the subjectivity analyze based on evaluation results. + + Args: + config (ConfigDict): The configuration object of the evaluation task. + It's expected to be filled out at runtime. + """ + + def __init__(self, config: ConfigDict, judge_type='single') -> None: + self.judge_type = judge_type + self.tasks = [] + self.cfg = config + if self.judge_type == 'single': + self.eval_model_cfgs = self.cfg['eval']['partitioner']['models'] + self.eval_model_abbrs = [ + model_abbr_from_cfg(model) for model in self.eval_model_cfgs + ] + else: + raise NotImplementedError + + self.judge_abbr = model_abbr_from_cfg( + self.cfg['eval']['partitioner']['judge_models'][0]) + self.judge_map = {'single': post_process_charm_mem} + self.judge_function = self.judge_map[self.judge_type] + + def summarize(self, + time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')): + """Summarize the subjectivity analysis based on evaluation results. + + Args: + time_str (str): Timestamp for file naming. + + Returns: + pd.DataFrame: The summary results. 
+ """ + if self.judge_type == 'single': + dataset_cfgs = self.cfg['datasets'] + judge_model = self.judge_abbr + output_dir, results_folder = get_outdir(self.cfg, time_str) + + accuracy_df = pd.DataFrame(columns=self.eval_model_abbrs) + for dataset in dataset_cfgs: + dataset_abbr = dataset_abbr_from_cfg(dataset) + dataset_instance = build_dataset_from_cfg(dataset) + out_dir = osp.join( + output_dir, + 'judged-by--' + judge_model + '-' + dataset_abbr) + os.makedirs(out_dir, exist_ok=True) + + cur_acc_dict = {'dataset': dataset_abbr} + for eval_model_abbr in self.eval_model_abbrs: + subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr + subdir_path = os.path.join(results_folder, subdir) + if os.path.isdir(subdir_path): + model = eval_model_abbr + (judged_answers, references, judging_prompts + ) = get_judgeanswer_and_reference_charm_mem( + dataset, + subdir_path, + self.judge_function, + ) + accuracy = get_accuracy(judged_answers) + cur_acc_dict[eval_model_abbr] = accuracy + + detail_dict = {} + for i in range(len(judged_answers)): + cur_dict = {} + cur_dict['judging_prompt'] = judging_prompts[i] + for input_col in dataset_instance.reader.input_columns: + cur_dict[input_col] = dataset_instance.reader[ + 'test'][input_col][i] + cur_dict['reference'] = references[i] + cur_dict.update(judged_answers[i]) + + detail_dict[str(i)] = cur_dict + + out_dict = {'score': accuracy, 'details': detail_dict} + fout = osp.join(out_dir, model + '.json') + with open(fout, 'w', encoding='utf-8') as f: + json.dump(out_dict, + f, + indent=4, + ensure_ascii=False) + else: + print(subdir_path + ' is not exist! please check!') + + accuracy_df = accuracy_df.append(cur_acc_dict, + ignore_index=True) + accuracy_df.set_index('dataset', inplace=True) + + accuracy_file = osp.join(output_dir, + 'judged-by--' + judge_model + '.csv') + accuracy_df.to_csv(accuracy_file, index=True) + with open(accuracy_file, 'r') as f: + x = from_csv(f) + print(x)