[Sync] Update LongEval (#443)
philipwangOvO authored Sep 27, 2023
1 parent 2bb7bee commit 3bb3d33
Showing 28 changed files with 182 additions and 67 deletions.
@@ -27,7 +27,7 @@
 )
 
 LEval_financialqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_govreport_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_legalqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_meetingsumm_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_narrativeqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator,),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_nq_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_newssumm_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_ps_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_patent_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_review_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_scientificqa_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
@@ -2,7 +2,7 @@
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
-from opencompass.datasets.leval import LEvalTopicRetrievalDataset
+from opencompass.datasets.leval import LEvalTopicRetrievalDataset, LEvalEMEvaluator
 from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi, general_postprocess
 
 LEval_tr_reader_cfg = dict(
@@ -28,7 +28,7 @@
 )
 
 LEval_tr_eval_cfg = dict(
-    evaluator=dict(type=EMEvaluator),
+    evaluator=dict(type=LEvalEMEvaluator),
     pred_postprocessor=dict(type=general_postprocess),
     pred_role='BOT'
 )
 
@@ -27,7 +27,7 @@
 )
 
 LEval_tvshow_summ_eval_cfg = dict(
-    evaluator=dict(type=LEvalGPTEvaluator),
+    evaluator=dict(type=RougeEvaluator),
     pred_role='BOT'
 )
 
3 changes: 2 additions & 1 deletion configs/datasets/longbench/longbench.py
@@ -7,7 +7,6 @@
 from .longbenchmultifieldqa_en.longbench_multifieldqa_en_gen import LongBench_multifieldqa_en_datasets
 from .longbenchmultifieldqa_zh.longbench_multifieldqa_zh_gen import LongBench_multifieldqa_zh_datasets
 from .longbenchnarrativeqa.longbench_narrativeqa_gen import LongBench_narrativeqa_datasets
-from .longbenchnq.longbench_nq_gen import LongBench_nq_datasets
 from .longbenchqasper.longbench_qasper_gen import LongBench_qasper_datasets
 from .longbenchtriviaqa.longbench_triviaqa_gen import LongBench_triviaqa_datasets
 from .longbenchgov_report.longbench_gov_report_gen import LongBench_gov_report_datasets
@@ -21,5 +20,7 @@
 from .longbenchpassage_count.longbench_passage_count_gen import LongBench_passage_count_datasets
 from .longbenchtrec.longbench_trec_gen import LongBench_trec_datasets
 from .longbenchlsht.longbench_lsht_gen import LongBench_lsht_datasets
+from .longbenchmulti_news.longbench_multi_news_gen import LongBench_multi_news_datasets
+from .longbenchsamsum.longbench_samsum_gen import LongBench_samsum_datasets
 
 longbench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
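
Note: the aggregation line above is the idiom that makes these edits self-contained: every *_datasets list pulled in by the imports is flattened into one list, so the new multi_news and samsum configs are picked up without further wiring. A minimal standalone sketch of the idiom, with hypothetical variable names:

a_datasets = [dict(abbr='a')]
b_datasets = [dict(abbr='b')]
# locals() sees every *_datasets binding above; sum(..., []) concatenates
# the per-task lists into one flat list.
all_datasets = sum((v for k, v in locals().items()
                    if k.endswith('_datasets')), [])
assert all_datasets == [dict(abbr='a'), dict(abbr='b')]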
4 changes: 4 additions & 0 deletions configs/datasets/longbench/longbenchmulti_news/longbench_multi_news_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .longbench_multi_news_gen_f6e3fb import LongBench_multi_news_datasets  # noqa: F401, F403
38 changes: 38 additions & 0 deletions configs/datasets/longbench/longbenchmulti_news/longbench_multi_news_gen_f6e3fb.py
@@ -0,0 +1,38 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import LongBenchRougeEvaluator, LongBenchmulti_newsDataset
+
+LongBench_multi_news_reader_cfg = dict(
+    input_columns=['context'],
+    output_column='answers',
+    train_split='test',
+    test_split='test'
+)
+
+LongBench_multi_news_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:'),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512)
+)
+
+LongBench_multi_news_eval_cfg = dict(
+    evaluator=dict(type=LongBenchRougeEvaluator),
+    pred_role='BOT'
+)
+
+LongBench_multi_news_datasets = [
+    dict(
+        type=LongBenchmulti_newsDataset,
+        abbr='LongBench_multi_news',
+        path='THUDM/LongBench',
+        name='multi_news',
+        reader_cfg=LongBench_multi_news_reader_cfg,
+        infer_cfg=LongBench_multi_news_infer_cfg,
+        eval_cfg=LongBench_multi_news_eval_cfg)
+]
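
Usage sketch: the new dataset can be pulled into a top-level run config through the same read_base mechanism as the stub above. The config file name and run command here are illustrative, not part of this commit; the relative import follows the longbench<name> directory pattern used throughout:

from mmengine.config import read_base

with read_base():
    from .datasets.longbench.longbenchmulti_news.longbench_multi_news_gen import \
        LongBench_multi_news_datasets

datasets = [*LongBench_multi_news_datasets]
# Pair with a model config, then launch via OpenCompass's runner, e.g.:
#   python run.py configs/eval_longbench_multi_news.py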
4 changes: 0 additions & 4 deletions configs/datasets/longbench/longbenchnq/longbench_nq_gen.py

This file was deleted.

38 changes: 0 additions & 38 deletions configs/datasets/longbench/longbenchnq/longbench_nq_gen_d30cb9.py

This file was deleted.

4 changes: 4 additions & 0 deletions configs/datasets/longbench/longbenchsamsum/longbench_samsum_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .longbench_samsum_gen_f4416d import LongBench_samsum_datasets  # noqa: F401, F403
38 changes: 38 additions & 0 deletions configs/datasets/longbench/longbenchsamsum/longbench_samsum_gen_f4416d.py
@@ -0,0 +1,38 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import LongBenchRougeEvaluator, LongBenchsamsumDataset
+
+LongBench_samsum_reader_cfg = dict(
+    input_columns=['context', 'input'],
+    output_column='answers',
+    train_split='test',
+    test_split='test'
+)
+
+LongBench_samsum_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt='Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}'),
+            ], )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=128)
+)
+
+LongBench_samsum_eval_cfg = dict(
+    evaluator=dict(type=LongBenchRougeEvaluator),
+    pred_role='BOT'
+)
+
+LongBench_samsum_datasets = [
+    dict(
+        type=LongBenchsamsumDataset,
+        abbr='LongBench_samsum',
+        path='THUDM/LongBench',
+        name='samsum',
+        reader_cfg=LongBench_samsum_reader_cfg,
+        infer_cfg=LongBench_samsum_infer_cfg,
+        eval_cfg=LongBench_samsum_eval_cfg)
+]
11 changes: 6 additions & 5 deletions configs/summarizers/longbench.py
@@ -13,19 +13,20 @@
         '--------- LongBench Summarization ---------',  # category
         'LongBench_gov_report',
         'LongBench_qmsum',
+        'LongBench_multi_news',
         'LongBench_vcsum',
         '--------- LongBench Few-shot Learning ---------',  # category
         'LongBench_trec',
-        'LongBench_nq',
         'LongBench_triviaqa',
+        'LongBench_samsum',
         'LongBench_lsht',
-        '--------- LongBench Code Completion ---------',  # category
-        'LongBench_lcc',
-        'LongBench_repobench-p',
         '--------- LongBench Synthetic Tasks ---------',  # category
-        'LongBench_passage_retrieval_en',
         'LongBench_passage_count',
+        'LongBench_passage_retrieval_en',
         'LongBench_passage_retrieval_zh',
+        '--------- LongBench Code Completion ---------',  # category
+        'LongBench_lcc',
+        'LongBench_repobench-p',
     ],
     summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
     prompt_db=dict(
1 change: 1 addition & 0 deletions opencompass/datasets/leval/__init__.py
@@ -1,3 +1,4 @@
+from .evaluators import LEvalEMEvaluator  # noqa: F401, F403
 from .evaluators import LEvalGPTEvaluator  # noqa: F401, F403
 from .leval_coursera import *  # noqa: F401, F403
 from .leval_financial_qa import *  # noqa: F401, F403
30 changes: 30 additions & 0 deletions opencompass/datasets/leval/evaluators.py
@@ -4,6 +4,7 @@
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS
 from opencompass.utils.prompt import PromptList
+from opencompass.utils.text_postprocessors import general_postprocess
 
 
 @ICL_EVALUATORS.register_module()
@@ -107,3 +108,32 @@ def score(self, predictions: List, references: List) -> dict:
 
         score = score / (num_samples - bad_case) * 100
         return {'score': score}
+
+
+@ICL_EVALUATORS.register_module()
+class LEvalEMEvaluator(BaseEvaluator):
+    """Exact match evaluator."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        predictions = [
+            general_postprocess(prediction) for prediction in predictions
+        ]
+        processed_answers = [general_postprocess(i) for i in references]
+
+        cnt = 0
+        for pred, ans, origin_ans in zip(predictions, processed_answers,
+                                         references):
+            if ans in pred or origin_ans in pred:
+                cnt += 1
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score}
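
Behavioral sketch for the new evaluator (hypothetical inputs; assumes general_postprocess lowercases and strips punctuation, which is what lets the substring test fire): a sample counts as a hit when either the normalized or the raw reference appears inside the normalized prediction.

from opencompass.datasets.leval import LEvalEMEvaluator

evaluator = LEvalEMEvaluator()
# 'Paris' normalizes to 'paris', which occurs in the normalized
# prediction 'the answer is paris', so the lone sample is a hit.
print(evaluator.score(predictions=['The answer is Paris.'],
                      references=['Paris']))  # {'score': 100.0}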
3 changes: 2 additions & 1 deletion opencompass/datasets/longbench/__init__.py
@@ -10,17 +10,18 @@
 from .longbench_hotpot_qa import *  # noqa: F401, F403
 from .longbench_lcc import *  # noqa: F401, F403
 from .longbench_lsht import *  # noqa: F401, F403
+from .longbench_multi_news import *  # noqa: F401, F403
 from .longbench_multifieldqa_en import *  # noqa: F401, F403
 from .longbench_multifieldqa_zh import *  # noqa: F401, F403
 from .longbench_musique import *  # noqa: F401, F403
 from .longbench_narrative_qa import *  # noqa: F401, F403
-from .longbench_nq import *  # noqa: F401, F403
 from .longbench_passage_count import *  # noqa: F401, F403
 from .longbench_passage_retrieval_en import *  # noqa: F401, F403
 from .longbench_passage_retrieval_zh import *  # noqa: F401, F403
 from .longbench_qasper import *  # noqa: F401, F403
 from .longbench_qmsum import *  # noqa: F401, F403
 from .longbench_repobench import *  # noqa: F401, F403
+from .longbench_samsum import *  # noqa: F401, F403
 from .longbench_trec import *  # noqa: F401, F403
 from .longbench_trivia_qa import *  # noqa: F401, F403
 from .longbench_vcsum import *  # noqa: F401, F403
4 changes: 2 additions & 2 deletions opencompass/datasets/longbench/evaluators.py
@@ -189,10 +189,10 @@ def score(self, predictions: List, references: List) -> dict:
                     list(jieba.cut(reference, cut_all=False)))
 
                 rouge = Rouge()
-                if prediction != '':
+                try:
                     cur_score = rouge.get_scores([prediction], [reference],
                                                  avg=True)['rouge-l']['f']
-                else:
+                except Exception:
                     cur_score = 0.
                 task_score = max(task_score, cur_score)
 
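
The dropped empty-string guard only covered one failure mode. A minimal sketch of the case the broad try/except now absorbs (assuming the pip rouge package, which raises on a hypothesis that cleans down to nothing and can fail on other degenerate inputs):

from rouge import Rouge

rouge = Rouge()
try:
    # An empty hypothesis raises inside get_scores rather than scoring 0.
    cur_score = rouge.get_scores([''], ['a reference'],
                                 avg=True)['rouge-l']['f']
except Exception:
    cur_score = 0.
print(cur_score)  # 0.0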