From 61bd19f07625038878eb12a1eb5e9d4e1da6ba11 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Wed, 25 Oct 2023 20:16:03 +0530 Subject: [PATCH] updated Evaluation_Metrics notebook --- demo/tutorials/misc/Evaluation_Metrics.ipynb | 2181 +++++++----------- 1 file changed, 854 insertions(+), 1327 deletions(-) diff --git a/demo/tutorials/misc/Evaluation_Metrics.ipynb b/demo/tutorials/misc/Evaluation_Metrics.ipynb index d939788cd..f92d5aa44 100644 --- a/demo/tutorials/misc/Evaluation_Metrics.ipynb +++ b/demo/tutorials/misc/Evaluation_Metrics.ipynb @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -182,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -207,7 +207,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -238,12 +238,12 @@ } ], "source": [ - "harness = Harness(task=\"question-answering\", model={\"model\": \"text-davinci-003\",\"hub\":\"openai\"}, data={\"data_source\" :\"LogiQA-test-tiny\"})" + "harness = Harness(task=\"question-answering\", model={\"model\": \"text-davinci-003\",\"hub\":\"openai\"}, data={\"data_source\" :\"CommonsenseQA-test-tiny\"})" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -254,11 +254,11 @@ " 'threshold': 0.9},\n", " 'embeddings': {'model': 'text-embedding-ada-002', 'hub': 'openai'},\n", " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", - " 'robustness': {'uppercase': {'min_pass_rate': 0.66},\n", - " 'lowercase': {'min_pass_rate': 0.66}}}}" + " 'robustness': {'add_ocr_typo': {'min_pass_rate': 0.66},\n", + " 'dyslexia_word_swap': {'min_pass_rate': 0.6}}}}" ] }, - "execution_count": 16, + 
"execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -270,8 +270,8 @@ " \"embeddings\":{\"model\":\"text-embedding-ada-002\",\"hub\":\"openai\"},\n", " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", "\n", - " 'robustness': {'uppercase': {'min_pass_rate': 0.66},\n", - " 'lowercase': {'min_pass_rate': 0.66}\n", + " 'robustness': {'add_ocr_typo': {'min_pass_rate': 0.66},\n", + " 'dyslexia_word_swap':{'min_pass_rate': 0.60}\n", " }\n", " }\n", " }\n", @@ -283,16 +283,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here we have configured the harness to perform two robustness tests (uppercase and lowercase) and defined the minimum pass rate for each test." + "Here we have configured the harness to perform two robustness tests (add_ocr_typo and dyslexia_word_swap) and defined the minimum pass rate for each test." ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "harness.data= harness.data[:10]" + "harness.data= harness.data[:5]" ] }, { @@ -304,21 +304,24 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Generating testcases...: 100%|██████████| 1/1 [00:00<00:00, 398.55it/s]\n" + "Generating testcases...: 100%|██████████| 1/1 [00:00original_question\n", " perturbed_context\n", " perturbed_question\n", + " expected_result\n", " \n", " \n", " \n", " \n", " 0\n", " robustness\n", - " uppercase\n", - " In the planning of a new district in a townshi...\n", - " Based on the above statement, which of the fol...\n", - " IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI...\n", - " BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL...\n", + " add_ocr_typo\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. 
suburbs\n", + " -\n", + " t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbs\n", + " B. apartment building\n", " \n", " \n", " 1\n", " robustness\n", - " uppercase\n", - " The company sent three young staff members to ...\n", - " So what are the three young people on business...\n", - " THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ...\n", - " SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS...\n", + " add_ocr_typo\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftem\n", + " D. constellation\n", " \n", " \n", " 2\n", " robustness\n", - " uppercase\n", - " In a traditional Chinese medicine preparation,...\n", - " According to the above statement, which of the...\n", - " IN A TRADITIONAL CHINESE MEDICINE PREPARATION,...\n", - " ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE...\n", + " add_ocr_typo\n", + " -\n", + " What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting\n", + " -\n", + " whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting\n", + " C. wonder about\n", " \n", " \n", " 3\n", " robustness\n", - " uppercase\n", - " In recent years, graduate entrance examination...\n", - " Which of the following can best strengthen the...\n", - " IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION...\n", - " WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE...\n", + " add_ocr_typo\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. 
wonderful memories\\nD. know truth\\nE. intelligent children\n", + " -\n", + " t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children\n", + " E. Intelligent children\n", " \n", " \n", " 4\n", " robustness\n", - " uppercase\n", - " A unit conducted the year-end assessment and a...\n", - " According to the above statement, it can be co...\n", - " A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A...\n", - " ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO...\n", + " add_ocr_typo\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " D. accidents\n", " \n", " \n", " 5\n", " robustness\n", - " uppercase\n", - " Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...\n", - " According to the above statement, you can get ...\n", - " ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR...\n", - " ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ...\n", + " dyslexia_word_swap\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " -\n", + " The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " B. 
apartment building\n", " \n", " \n", " 6\n", " robustness\n", - " uppercase\n", - " The person in charge of the relevant departmen...\n", - " Which of the following is true will most weake...\n", - " THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN...\n", - " WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE...\n", + " dyslexia_word_swap\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " D. constellation\n", " \n", " \n", " 7\n", " robustness\n", - " uppercase\n", - " There are five teams participating in the game...\n", - " The result of the match showed that only one a...\n", - " THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME...\n", - " THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A...\n", + " dyslexia_word_swap\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children\n", + " -\n", + " The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children\n", + " E. 
Intelligent children\n", " \n", " \n", " 8\n", " robustness\n", - " uppercase\n", - " Compared with small and medium-sized cities, e...\n", - " Which of the following is the conclusion must ...\n", - " COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E...\n", - " WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ...\n", - " \n", - " \n", - " 9\n", - " robustness\n", - " uppercase\n", - " Researchers recently discovered that there is ...\n", - " Which of the following is assumed by the above...\n", - " RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ...\n", - " WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE...\n", - " \n", - " \n", - " 10\n", - " robustness\n", - " lowercase\n", - " In the planning of a new district in a townshi...\n", - " Based on the above statement, which of the fol...\n", - " in the planning of a new district in a townshi...\n", - " based on the above statement, which of the fol...\n", - " \n", - " \n", - " 11\n", - " robustness\n", - " lowercase\n", - " The company sent three young staff members to ...\n", - " So what are the three young people on business...\n", - " the company sent three young staff members to ...\n", - " so what are the three young people on business...\n", - " \n", - " \n", - " 12\n", - " robustness\n", - " lowercase\n", - " In a traditional Chinese medicine preparation,...\n", - " According to the above statement, which of the...\n", - " in a traditional chinese medicine preparation,...\n", - " according to the above statement, which of the...\n", - " \n", - " \n", - " 13\n", - " robustness\n", - " lowercase\n", - " In recent years, graduate entrance examination...\n", - " Which of the following can best strengthen the...\n", - " in recent years, graduate entrance examination...\n", - " which of the following can best strengthen the...\n", - " \n", - " \n", - " 14\n", - " robustness\n", - " lowercase\n", - " A unit conducted the year-end assessment and a...\n", - " According to the above statement, it can be co...\n", - " a unit 
conducted the year-end assessment and a...\n", - " according to the above statement, it can be co...\n", - " \n", - " \n", - " 15\n", - " robustness\n", - " lowercase\n", - " Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...\n", - " According to the above statement, you can get ...\n", - " zhang ming, li ying, wang jia and chen rui wor...\n", - " according to the above statement, you can get ...\n", - " \n", - " \n", - " 16\n", - " robustness\n", - " lowercase\n", - " The person in charge of the relevant departmen...\n", - " Which of the following is true will most weake...\n", - " the person in charge of the relevant departmen...\n", - " which of the following is true will most weake...\n", - " \n", - " \n", - " 17\n", - " robustness\n", - " lowercase\n", - " There are five teams participating in the game...\n", - " The result of the match showed that only one a...\n", - " there are five teams participating in the game...\n", - " the result of the match showed that only one a...\n", - " \n", - " \n", - " 18\n", - " robustness\n", - " lowercase\n", - " Compared with small and medium-sized cities, e...\n", - " Which of the following is the conclusion must ...\n", - " compared with small and medium-sized cities, e...\n", - " which of the following is the conclusion must ...\n", - " \n", - " \n", - " 19\n", - " robustness\n", - " lowercase\n", - " Researchers recently discovered that there is ...\n", - " Which of the following is assumed by the above...\n", - " researchers recently discovered that there is ...\n", - " which of the following is assumed by the above...\n", + " dyslexia_word_swap\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " D. 
accidents\n", " \n", " \n", "\n", "" ], "text/plain": [ - " category test_type original_context \\\n", - "0 robustness uppercase In the planning of a new district in a townshi... \n", - "1 robustness uppercase The company sent three young staff members to ... \n", - "2 robustness uppercase In a traditional Chinese medicine preparation,... \n", - "3 robustness uppercase In recent years, graduate entrance examination... \n", - "4 robustness uppercase A unit conducted the year-end assessment and a... \n", - "5 robustness uppercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "6 robustness uppercase The person in charge of the relevant departmen... \n", - "7 robustness uppercase There are five teams participating in the game... \n", - "8 robustness uppercase Compared with small and medium-sized cities, e... \n", - "9 robustness uppercase Researchers recently discovered that there is ... \n", - "10 robustness lowercase In the planning of a new district in a townshi... \n", - "11 robustness lowercase The company sent three young staff members to ... \n", - "12 robustness lowercase In a traditional Chinese medicine preparation,... \n", - "13 robustness lowercase In recent years, graduate entrance examination... \n", - "14 robustness lowercase A unit conducted the year-end assessment and a... \n", - "15 robustness lowercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "16 robustness lowercase The person in charge of the relevant departmen... \n", - "17 robustness lowercase There are five teams participating in the game... \n", - "18 robustness lowercase Compared with small and medium-sized cities, e... \n", - "19 robustness lowercase Researchers recently discovered that there is ... 
\n", + " category test_type original_context \\\n", + "0 robustness add_ocr_typo - \n", + "1 robustness add_ocr_typo - \n", + "2 robustness add_ocr_typo - \n", + "3 robustness add_ocr_typo - \n", + "4 robustness add_ocr_typo - \n", + "5 robustness dyslexia_word_swap - \n", + "6 robustness dyslexia_word_swap - \n", + "7 robustness dyslexia_word_swap - \n", + "8 robustness dyslexia_word_swap - \n", "\n", - " original_question \\\n", - "0 Based on the above statement, which of the fol... \n", - "1 So what are the three young people on business... \n", - "2 According to the above statement, which of the... \n", - "3 Which of the following can best strengthen the... \n", - "4 According to the above statement, it can be co... \n", - "5 According to the above statement, you can get ... \n", - "6 Which of the following is true will most weake... \n", - "7 The result of the match showed that only one a... \n", - "8 Which of the following is the conclusion must ... \n", - "9 Which of the following is assumed by the above... \n", - "10 Based on the above statement, which of the fol... \n", - "11 So what are the three young people on business... \n", - "12 According to the above statement, which of the... \n", - "13 Which of the following can best strengthen the... \n", - "14 According to the above statement, it can be co... \n", - "15 According to the above statement, you can get ... \n", - "16 Which of the following is true will most weake... \n", - "17 The result of the match showed that only one a... \n", - "18 Which of the following is the conclusion must ... \n", - "19 Which of the following is assumed by the above... \n", + " original_question \\\n", + "0 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "1 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. 
solar system \n", + "2 What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting \n", + "3 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "4 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "8 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", "\n", - " perturbed_context \\\n", - "0 IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI... \n", - "1 THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ... \n", - "2 IN A TRADITIONAL CHINESE MEDICINE PREPARATION,... \n", - "3 IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION... \n", - "4 A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A... \n", - "5 ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR... \n", - "6 THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN... \n", - "7 THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME... \n", - "8 COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E... \n", - "9 RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ... \n", - "10 in the planning of a new district in a townshi... \n", - "11 the company sent three young staff members to ... \n", - "12 in a traditional chinese medicine preparation,... \n", - "13 in recent years, graduate entrance examination... 
\n", - "14 a unit conducted the year-end assessment and a... \n", - "15 zhang ming, li ying, wang jia and chen rui wor... \n", - "16 the person in charge of the relevant departmen... \n", - "17 there are five teams participating in the game... \n", - "18 compared with small and medium-sized cities, e... \n", - "19 researchers recently discovered that there is ... \n", + " perturbed_context \\\n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - \n", + "5 - \n", + "6 - \n", + "7 - \n", + "8 - \n", "\n", - " perturbed_question \n", - "0 BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL... \n", - "1 SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS... \n", - "2 ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE... \n", - "3 WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE... \n", - "4 ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO... \n", - "5 ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ... \n", - "6 WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE... \n", - "7 THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A... \n", - "8 WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ... \n", - "9 WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE... \n", - "10 based on the above statement, which of the fol... \n", - "11 so what are the three young people on business... \n", - "12 according to the above statement, which of the... \n", - "13 which of the following can best strengthen the... \n", - "14 according to the above statement, it can be co... \n", - "15 according to the above statement, you can get ... \n", - "16 which of the following is true will most weake... \n", - "17 the result of the match showed that only one a... \n", - "18 which of the following is the conclusion must ... \n", - "19 which of the following is assumed by the above... " + " perturbed_question \\\n", + "0 t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. 
suburbs \n", + "1 t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftem \n", + "2 whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting \n", + "3 t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children \n", + "4 w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children \n", + "8 What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "\n", + " expected_result \n", + "0 B. apartment building \n", + "1 D. constellation \n", + "2 C. wonder about \n", + "3 E. Intelligent children \n", + "4 D. accidents \n", + "5 B. apartment building \n", + "6 D. constellation \n", + "7 E. Intelligent children \n", + "8 D. accidents " ] }, - "execution_count": 19, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -668,21 +549,21 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Running testcases... : 100%|██████████| 20/20 [00:48<00:00, 2.42s/it]\n" + "Running testcases... 
: 100%|██████████| 9/9 [00:23<00:00, 2.64s/it]\n" ] }, { "data": { "text/plain": [] }, - "execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -707,7 +588,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -747,402 +628,182 @@ " \n", " 0\n", " robustness\n", - " uppercase\n", - " In the planning of a new district in a townshi...\n", - " Based on the above statement, which of the fol...\n", - " IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI...\n", - " BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL...\n", - " B. The leisure area is southwest of the cultu...\n", - " B. The Leisure Area is Southwest of the Cultu...\n", - " 0.999999\n", + " add_ocr_typo\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " -\n", + " t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbs\n", + " B. apartment building\n", + " B. Apartment building\n", + " 1.000000\n", " True\n", " \n", " \n", " 1\n", " robustness\n", - " uppercase\n", - " The company sent three young staff members to ...\n", - " So what are the three young people on business...\n", - " THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ...\n", - " SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS...\n", - " A. 0-year-old accountant, 20-year-old salespe...\n", - " A. 0-YEAR-OLD ACCOUNTANT, 20-YEAR-OLD SALESPE...\n", + " add_ocr_typo\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. 
solar fvftem\n", + " D. constellation\n", + " D. constellation\n", " 1.000000\n", " True\n", " \n", " \n", " 2\n", " robustness\n", - " uppercase\n", - " In a traditional Chinese medicine preparation,...\n", - " According to the above statement, which of the...\n", - " IN A TRADITIONAL CHINESE MEDICINE PREPARATION,...\n", - " ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE...\n", - " B. o Shouwu.\n", - " B. O SHOUWU.\n", - " 1.000000\n", + " add_ocr_typo\n", + " -\n", + " What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting\n", + " -\n", + " whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting\n", + " C. wonder about\n", + " C. wonder about\n", + " 0.999803\n", " True\n", " \n", " \n", " 3\n", " robustness\n", - " uppercase\n", - " In recent years, graduate entrance examination...\n", - " Which of the following can best strengthen the...\n", - " IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION...\n", - " WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE...\n", - " B. Only those who intend to take the graduate...\n", - " B. ONLY THOSE WHO INTEND TO TAKE THE GRADUATE...\n", - " 1.000000\n", - " True\n", + " add_ocr_typo\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children\n", + " -\n", + " t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children\n", + " E. Intelligent children\n", + " B. 
own self\n", + " 0.804418\n", + " False\n", " \n", " \n", " 4\n", " robustness\n", - " uppercase\n", - " A unit conducted the year-end assessment and a...\n", - " According to the above statement, it can be co...\n", - " A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A...\n", - " ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO...\n", - " C. C.\n", - " D. DING.\n", - " 0.871277\n", - " False\n", + " add_ocr_typo\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " D. accidents\n", + " D. Accidents\n", + " 0.999998\n", + " True\n", " \n", " \n", " 5\n", " robustness\n", - " uppercase\n", - " Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...\n", - " According to the above statement, you can get ...\n", - " ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR...\n", - " ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ...\n", - " A. Chen Rui can't speak the Central Plains Ma...\n", - " A. Hen Rui can't speak the Central Plains Man...\n", - " 0.978401\n", + " dyslexia_word_swap\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " -\n", + " The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " B. apartment building\n", + " B. apartment building\n", + " 0.999998\n", " True\n", " \n", " \n", " 6\n", " robustness\n", - " uppercase\n", - " The person in charge of the relevant departmen...\n", - " Which of the following is true will most weake...\n", - " THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN...\n", - " WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE...\n", - " A. 
This newly built house in Hexi Village has...\n", - " A. This newly built house in Hexi Village has...\n", - " 0.999999\n", + " dyslexia_word_swap\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " D. constellation\n", + " D. constellation\n", + " 0.999997\n", " True\n", " \n", " \n", " 7\n", " robustness\n", - " uppercase\n", - " There are five teams participating in the game...\n", - " The result of the match showed that only one a...\n", - " THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME...\n", - " THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A...\n", - " B. Jiangnan.\n", - " B. JIANGNAN.\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 8\n", - " robustness\n", - " uppercase\n", - " Compared with small and medium-sized cities, e...\n", - " Which of the following is the conclusion must ...\n", - " COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E...\n", - " WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ...\n", - " B. Simple development of large cities is not ...\n", - " B. Simple development of large cities is not ...\n", + " dyslexia_word_swap\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children\n", + " -\n", + " The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children\n", + " E. Intelligent children\n", + " E. 
Intelligent children\n", " 0.999999\n", " True\n", " \n", " \n", - " 9\n", - " robustness\n", - " uppercase\n", - " Researchers recently discovered that there is ...\n", - " Which of the following is assumed by the above...\n", - " RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ...\n", - " WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE...\n", - " D. The brain cannot process too much informat...\n", - " D. The brain cannot process too much informat...\n", - " 0.999998\n", - " True\n", - " \n", - " \n", - " 10\n", - " robustness\n", - " lowercase\n", - " In the planning of a new district in a townshi...\n", - " Based on the above statement, which of the fol...\n", - " in the planning of a new district in a townshi...\n", - " based on the above statement, which of the fol...\n", - " B. The leisure area is southwest of the cultu...\n", - " b. the leisure area is southwest of the cultu...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 11\n", - " robustness\n", - " lowercase\n", - " The company sent three young staff members to ...\n", - " So what are the three young people on business...\n", - " the company sent three young staff members to ...\n", - " so what are the three young people on business...\n", - " A. 0-year-old accountant, 20-year-old salespe...\n", - " a. 0-year-old accountant, 20-year-old salespe...\n", - " 0.999859\n", - " True\n", - " \n", - " \n", - " 12\n", - " robustness\n", - " lowercase\n", - " In a traditional Chinese medicine preparation,...\n", - " According to the above statement, which of the...\n", - " in a traditional chinese medicine preparation,...\n", - " according to the above statement, which of the...\n", - " B. o Shouwu.\n", - " b. 
o shouwu.\n", - " 0.999983\n", - " True\n", - " \n", - " \n", - " 13\n", - " robustness\n", - " lowercase\n", - " In recent years, graduate entrance examination...\n", - " Which of the following can best strengthen the...\n", - " in recent years, graduate entrance examination...\n", - " which of the following can best strengthen the...\n", - " B. Only those who intend to take the graduate...\n", - " B. only those who intend to take the graduate...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 14\n", - " robustness\n", - " lowercase\n", - " A unit conducted the year-end assessment and a...\n", - " According to the above statement, it can be co...\n", - " a unit conducted the year-end assessment and a...\n", - " according to the above statement, it can be co...\n", - " C. C.\n", - " d. ding.\n", - " 0.871224\n", - " False\n", - " \n", - " \n", - " 15\n", - " robustness\n", - " lowercase\n", - " Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...\n", - " According to the above statement, you can get ...\n", - " zhang ming, li ying, wang jia and chen rui wor...\n", - " according to the above statement, you can get ...\n", - " A. Chen Rui can't speak the Central Plains Ma...\n", - " A. Chen Rui can't speak the Central Plains Ma...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 16\n", - " robustness\n", - " lowercase\n", - " The person in charge of the relevant departmen...\n", - " Which of the following is true will most weake...\n", - " the person in charge of the relevant departmen...\n", - " which of the following is true will most weake...\n", - " A. This newly built house in Hexi Village has...\n", - " A. 
this newly built house in hexi village has...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 17\n", + " 8\n", " robustness\n", - " lowercase\n", - " There are five teams participating in the game...\n", - " The result of the match showed that only one a...\n", - " there are five teams participating in the game...\n", - " the result of the match showed that only one a...\n", - " B. Jiangnan.\n", - " b. jiangnan.\n", + " dyslexia_word_swap\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " D. accidents\n", + " D. Accidents\n", " 0.999998\n", " True\n", " \n", - " \n", - " 18\n", - " robustness\n", - " lowercase\n", - " Compared with small and medium-sized cities, e...\n", - " Which of the following is the conclusion must ...\n", - " compared with small and medium-sized cities, e...\n", - " which of the following is the conclusion must ...\n", - " B. Simple development of large cities is not ...\n", - " b. simple development of large cities is not ...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 19\n", - " robustness\n", - " lowercase\n", - " Researchers recently discovered that there is ...\n", - " Which of the following is assumed by the above...\n", - " researchers recently discovered that there is ...\n", - " which of the following is assumed by the above...\n", - " D. The brain cannot process too much informat...\n", - " d. the brain cannot process too much informat...\n", - " 1.000000\n", - " True\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " category test_type original_context \\\n", - "0 robustness uppercase In the planning of a new district in a townshi... \n", - "1 robustness uppercase The company sent three young staff members to ... 
\n", - "2 robustness uppercase In a traditional Chinese medicine preparation,... \n", - "3 robustness uppercase In recent years, graduate entrance examination... \n", - "4 robustness uppercase A unit conducted the year-end assessment and a... \n", - "5 robustness uppercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "6 robustness uppercase The person in charge of the relevant departmen... \n", - "7 robustness uppercase There are five teams participating in the game... \n", - "8 robustness uppercase Compared with small and medium-sized cities, e... \n", - "9 robustness uppercase Researchers recently discovered that there is ... \n", - "10 robustness lowercase In the planning of a new district in a townshi... \n", - "11 robustness lowercase The company sent three young staff members to ... \n", - "12 robustness lowercase In a traditional Chinese medicine preparation,... \n", - "13 robustness lowercase In recent years, graduate entrance examination... \n", - "14 robustness lowercase A unit conducted the year-end assessment and a... \n", - "15 robustness lowercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "16 robustness lowercase The person in charge of the relevant departmen... \n", - "17 robustness lowercase There are five teams participating in the game... \n", - "18 robustness lowercase Compared with small and medium-sized cities, e... \n", - "19 robustness lowercase Researchers recently discovered that there is ... \n", - "\n", - " original_question \\\n", - "0 Based on the above statement, which of the fol... \n", - "1 So what are the three young people on business... \n", - "2 According to the above statement, which of the... \n", - "3 Which of the following can best strengthen the... \n", - "4 According to the above statement, it can be co... \n", - "5 According to the above statement, you can get ... \n", - "6 Which of the following is true will most weake... \n", - "7 The result of the match showed that only one a... 
\n", - "8 Which of the following is the conclusion must ... \n", - "9 Which of the following is assumed by the above... \n", - "10 Based on the above statement, which of the fol... \n", - "11 So what are the three young people on business... \n", - "12 According to the above statement, which of the... \n", - "13 Which of the following can best strengthen the... \n", - "14 According to the above statement, it can be co... \n", - "15 According to the above statement, you can get ... \n", - "16 Which of the following is true will most weake... \n", - "17 The result of the match showed that only one a... \n", - "18 Which of the following is the conclusion must ... \n", - "19 Which of the following is assumed by the above... \n", + " category test_type original_context \\\n", + "0 robustness add_ocr_typo - \n", + "1 robustness add_ocr_typo - \n", + "2 robustness add_ocr_typo - \n", + "3 robustness add_ocr_typo - \n", + "4 robustness add_ocr_typo - \n", + "5 robustness dyslexia_word_swap - \n", + "6 robustness dyslexia_word_swap - \n", + "7 robustness dyslexia_word_swap - \n", + "8 robustness dyslexia_word_swap - \n", "\n", - " perturbed_context \\\n", - "0 IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI... \n", - "1 THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ... \n", - "2 IN A TRADITIONAL CHINESE MEDICINE PREPARATION,... \n", - "3 IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION... \n", - "4 A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A... \n", - "5 ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR... \n", - "6 THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN... \n", - "7 THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME... \n", - "8 COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E... \n", - "9 RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ... \n", - "10 in the planning of a new district in a townshi... \n", - "11 the company sent three young staff members to ... \n", - "12 in a traditional chinese medicine preparation,... 
\n", - "13 in recent years, graduate entrance examination... \n", - "14 a unit conducted the year-end assessment and a... \n", - "15 zhang ming, li ying, wang jia and chen rui wor... \n", - "16 the person in charge of the relevant departmen... \n", - "17 there are five teams participating in the game... \n", - "18 compared with small and medium-sized cities, e... \n", - "19 researchers recently discovered that there is ... \n", + " original_question \\\n", + "0 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "1 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "2 What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting \n", + "3 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "4 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "8 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", "\n", - " perturbed_question \\\n", - "0 BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL... 
\n", - "1 SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS... \n", - "2 ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE... \n", - "3 WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE... \n", - "4 ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO... \n", - "5 ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ... \n", - "6 WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE... \n", - "7 THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A... \n", - "8 WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ... \n", - "9 WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE... \n", - "10 based on the above statement, which of the fol... \n", - "11 so what are the three young people on business... \n", - "12 according to the above statement, which of the... \n", - "13 which of the following can best strengthen the... \n", - "14 according to the above statement, it can be co... \n", - "15 according to the above statement, you can get ... \n", - "16 which of the following is true will most weake... \n", - "17 the result of the match showed that only one a... \n", - "18 which of the following is the conclusion must ... \n", - "19 which of the following is assumed by the above... \n", + " perturbed_context \\\n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - \n", + "5 - \n", + "6 - \n", + "7 - \n", + "8 - \n", "\n", - " expected_result \\\n", - "0 B. The leisure area is southwest of the cultu... \n", - "1 A. 0-year-old accountant, 20-year-old salespe... \n", - "2 B. o Shouwu. \n", - "3 B. Only those who intend to take the graduate... \n", - "4 C. C. \n", - "5 A. Chen Rui can't speak the Central Plains Ma... \n", - "6 A. This newly built house in Hexi Village has... \n", - "7 B. Jiangnan. \n", - "8 B. Simple development of large cities is not ... \n", - "9 D. The brain cannot process too much informat... \n", - "10 B. The leisure area is southwest of the cultu... \n", - "11 A. 0-year-old accountant, 20-year-old salespe... \n", - "12 B. o Shouwu. \n", - "13 B. Only those who intend to take the graduate... 
\n", - "14 C. C. \n", - "15 A. Chen Rui can't speak the Central Plains Ma... \n", - "16 A. This newly built house in Hexi Village has... \n", - "17 B. Jiangnan. \n", - "18 B. Simple development of large cities is not ... \n", - "19 D. The brain cannot process too much informat... \n", + " perturbed_question \\\n", + "0 t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbs \n", + "1 t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftem \n", + "2 whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting \n", + "3 t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children \n", + "4 w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children \n", + "8 What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", "\n", - " actual_result eval_score pass \n", - "0 B. The Leisure Area is Southwest of the Cultu... 0.999999 True \n", - "1 A. 0-YEAR-OLD ACCOUNTANT, 20-YEAR-OLD SALESPE... 1.000000 True \n", - "2 B. O SHOUWU. 1.000000 True \n", - "3 B. 
ONLY THOSE WHO INTEND TO TAKE THE GRADUATE... 1.000000 True \n", - "4 D. DING. 0.871277 False \n", - "5 A. Hen Rui can't speak the Central Plains Man... 0.978401 True \n", - "6 A. This newly built house in Hexi Village has... 0.999999 True \n", - "7 B. JIANGNAN. 1.000000 True \n", - "8 B. Simple development of large cities is not ... 0.999999 True \n", - "9 D. The brain cannot process too much informat... 0.999998 True \n", - "10 b. the leisure area is southwest of the cultu... 1.000000 True \n", - "11 a. 0-year-old accountant, 20-year-old salespe... 0.999859 True \n", - "12 b. o shouwu. 0.999983 True \n", - "13 B. only those who intend to take the graduate... 1.000000 True \n", - "14 d. ding. 0.871224 False \n", - "15 A. Chen Rui can't speak the Central Plains Ma... 1.000000 True \n", - "16 A. this newly built house in hexi village has... 1.000000 True \n", - "17 b. jiangnan. 0.999998 True \n", - "18 b. simple development of large cities is not ... 1.000000 True \n", - "19 d. the brain cannot process too much informat... 1.000000 True " + " expected_result actual_result eval_score pass \n", + "0 B. apartment building B. Apartment building 1.000000 True \n", + "1 D. constellation D. constellation 1.000000 True \n", + "2 C. wonder about C. wonder about 0.999803 True \n", + "3 E. Intelligent children B. own self 0.804418 False \n", + "4 D. accidents D. Accidents 0.999998 True \n", + "5 B. apartment building B. apartment building 0.999998 True \n", + "6 D. constellation D. constellation 0.999997 True \n", + "7 E. Intelligent children E. Intelligent children 0.999999 True \n", + "8 D. accidents D. 
Accidents 0.999998 True " ] }, - "execution_count": 22, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1174,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1211,21 +872,21 @@ " \n", " 0\n", " robustness\n", - " uppercase\n", + " add_ocr_typo\n", " 1\n", - " 9\n", - " 90%\n", + " 4\n", + " 80%\n", " 66%\n", " True\n", " \n", " \n", " 1\n", " robustness\n", - " lowercase\n", - " 1\n", - " 9\n", - " 90%\n", - " 66%\n", + " dyslexia_word_swap\n", + " 0\n", + " 4\n", + " 100%\n", + " 60%\n", " True\n", " \n", " \n", @@ -1233,16 +894,16 @@ "" ], "text/plain": [ - " category test_type fail_count pass_count pass_rate minimum_pass_rate \\\n", - "0 robustness uppercase 1 9 90% 66% \n", - "1 robustness lowercase 1 9 90% 66% \n", + " category test_type fail_count pass_count pass_rate \\\n", + "0 robustness add_ocr_typo 1 4 80% \n", + "1 robustness dyslexia_word_swap 0 4 100% \n", "\n", - " pass \n", - "0 True \n", - "1 True " + " minimum_pass_rate pass \n", + "0 66% True \n", + "1 60% True " ] }, - "execution_count": 23, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1255,29 +916,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Note:** If the above evaluation metric, threshold value, or embeddings model does not work for your use case, you can easily customize the configuration. 
Simply adjust the configuration parameters to suit your specific needs, and then call the `.generated_results()` method with the updated configuration.\n", - "\n", - "Here's an example of how to modify the configuration and generate results:\n", - "\n", - "\n", - "```python\n", - "harness.configure(\n", - "{\n", - " \"evaluation\": {\"metric\":\"embedding_distance\",\"distance\":\"cosine\",\"threshold\":0.9},\n", - " \"embeddings\":{\"model\":\"sentence-transformers/all-mpnet-base-v2\",\"hub\":\"huggingface\"},\n", - " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", - "\n", - " 'robustness': {'uppercase': {'min_pass_rate': 0.66},\n", - " 'lowercase': {'min_pass_rate': 0.66}\n", - " }\n", - " }\n", - " }\n", - "\n", - ")\n", - "\n", - "# After configuring the parameters, generate the results\n", - "harness.generated_results()\n", - "```" + "**Note:** If the above evaluation metric, threshold value, or embeddings model does not work for your use case, you can easily customize the configuration. 
Simply adjust the configuration parameters to suit your specific needs, and then call the `.generated_results()` method with the updated configuration.\n" ] }, { @@ -1289,7 +928,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1301,11 +940,11 @@ " 'embeddings': {'model': 'sentence-transformers/all-mpnet-base-v2',\n", " 'hub': 'huggingface'},\n", " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", - " 'robustness': {'uppercase': {'min_pass_rate': 0.66},\n", - " 'lowercase': {'min_pass_rate': 0.66}}}}" + " 'robustness': {'add_ocr_typo': {'min_pass_rate': 0.66},\n", + " 'dyslexia_word_swap': {'min_pass_rate': 0.6}}}}" ] }, - "execution_count": 30, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1318,8 +957,8 @@ " \"embeddings\":{\"model\":\"sentence-transformers/all-mpnet-base-v2\",\"hub\":\"huggingface\"},\n", " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", "\n", - " 'robustness': {'uppercase': {'min_pass_rate': 0.66},\n", - " 'lowercase': {'min_pass_rate': 0.66}\n", + " 'robustness': {'add_ocr_typo': {'min_pass_rate': 0.66},\n", + " 'dyslexia_word_swap':{'min_pass_rate': 0.60}\n", " }\n", " }\n", " }\n", @@ -1336,7 +975,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1376,260 +1015,117 @@ " \n", " 0\n", " robustness\n", - " uppercase\n", - " In the planning of a new district in a townshi...\n", - " Based on the above statement, which of the fol...\n", - " IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI...\n", - " BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL...\n", - " B. The leisure area is southwest of the cultu...\n", - " B. The Leisure Area is Southwest of the Cultu...\n", + " add_ocr_typo\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. 
suburbs\n", + " -\n", + " t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbs\n", + " B. apartment building\n", + " B. Apartment building\n", " 1.000000\n", " True\n", " \n", " \n", " 1\n", " robustness\n", - " uppercase\n", - " The company sent three young staff members to ...\n", - " So what are the three young people on business...\n", - " THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ...\n", - " SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS...\n", - " A. 0-year-old accountant, 20-year-old salespe...\n", - " A. 0-YEAR-OLD ACCOUNTANT, 20-YEAR-OLD SALESPE...\n", + " add_ocr_typo\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftem\n", + " D. constellation\n", + " D. constellation\n", " 1.000000\n", " True\n", " \n", " \n", " 2\n", " robustness\n", - " uppercase\n", - " In a traditional Chinese medicine preparation,...\n", - " According to the above statement, which of the...\n", - " IN A TRADITIONAL CHINESE MEDICINE PREPARATION,...\n", - " ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE...\n", - " B. o Shouwu.\n", - " B. O SHOUWU.\n", + " add_ocr_typo\n", + " -\n", + " What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting\n", + " -\n", + " whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting\n", + " C. wonder about\n", + " C. 
wonder about\n", " 1.000000\n", " True\n", " \n", " \n", " 3\n", " robustness\n", - " uppercase\n", - " In recent years, graduate entrance examination...\n", - " Which of the following can best strengthen the...\n", - " IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION...\n", - " WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE...\n", - " B. Only those who intend to take the graduate...\n", - " B. ONLY THOSE WHO INTEND TO TAKE THE GRADUATE...\n", - " 1.000000\n", - " True\n", + " add_ocr_typo\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children\n", + " -\n", + " t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children\n", + " E. Intelligent children\n", + " B. own self\n", + " 0.481119\n", + " False\n", " \n", " \n", " 4\n", " robustness\n", - " uppercase\n", - " A unit conducted the year-end assessment and a...\n", - " According to the above statement, it can be co...\n", - " A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A...\n", - " ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO...\n", - " C. C.\n", - " D. DING.\n", - " 0.272773\n", - " False\n", + " add_ocr_typo\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " D. accidents\n", + " D. Accidents\n", + " 1.000000\n", + " True\n", " \n", " \n", " 5\n", " robustness\n", - " uppercase\n", - " Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...\n", - " According to the above statement, you can get ...\n", - " ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR...\n", - " ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ...\n", - " A. Chen Rui can't speak the Central Plains Ma...\n", - " A. 
Hen Rui can't speak the Central Plains Man...\n", - " 0.939710\n", + " dyslexia_word_swap\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " -\n", + " The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " B. apartment building\n", + " B. apartment building\n", + " 1.000000\n", " True\n", " \n", " \n", " 6\n", " robustness\n", - " uppercase\n", - " The person in charge of the relevant departmen...\n", - " Which of the following is true will most weake...\n", - " THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN...\n", - " WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE...\n", - " A. This newly built house in Hexi Village has...\n", - " A. This newly built house in Hexi Village has...\n", + " dyslexia_word_swap\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " D. constellation\n", + " D. constellation\n", " 1.000000\n", " True\n", " \n", " \n", " 7\n", " robustness\n", - " uppercase\n", - " There are five teams participating in the game...\n", - " The result of the match showed that only one a...\n", - " THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME...\n", - " THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A...\n", - " B. Jiangnan.\n", - " B. JIANGNAN.\n", + " dyslexia_word_swap\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. 
intelligent children\n", + " -\n", + " The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children\n", + " E. Intelligent children\n", + " E. Intelligent children\n", " 1.000000\n", " True\n", " \n", " \n", " 8\n", " robustness\n", - " uppercase\n", - " Compared with small and medium-sized cities, e...\n", - " Which of the following is the conclusion must ...\n", - " COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E...\n", - " WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ...\n", - " B. Simple development of large cities is not ...\n", - " B. Simple development of large cities is not ...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 9\n", - " robustness\n", - " uppercase\n", - " Researchers recently discovered that there is ...\n", - " Which of the following is assumed by the above...\n", - " RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ...\n", - " WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE...\n", - " D. The brain cannot process too much informat...\n", - " D. The brain cannot process too much informat...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 10\n", - " robustness\n", - " lowercase\n", - " In the planning of a new district in a townshi...\n", - " Based on the above statement, which of the fol...\n", - " in the planning of a new district in a townshi...\n", - " based on the above statement, which of the fol...\n", - " B. The leisure area is southwest of the cultu...\n", - " b. the leisure area is southwest of the cultu...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 11\n", - " robustness\n", - " lowercase\n", - " The company sent three young staff members to ...\n", - " So what are the three young people on business...\n", - " the company sent three young staff members to ...\n", - " so what are the three young people on business...\n", - " A. 0-year-old accountant, 20-year-old salespe...\n", - " a. 
0-year-old accountant, 20-year-old salespe...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 12\n", - " robustness\n", - " lowercase\n", - " In a traditional Chinese medicine preparation,...\n", - " According to the above statement, which of the...\n", - " in a traditional chinese medicine preparation,...\n", - " according to the above statement, which of the...\n", - " B. o Shouwu.\n", - " b. o shouwu.\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 13\n", - " robustness\n", - " lowercase\n", - " In recent years, graduate entrance examination...\n", - " Which of the following can best strengthen the...\n", - " in recent years, graduate entrance examination...\n", - " which of the following can best strengthen the...\n", - " B. Only those who intend to take the graduate...\n", - " B. only those who intend to take the graduate...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 14\n", - " robustness\n", - " lowercase\n", - " A unit conducted the year-end assessment and a...\n", - " According to the above statement, it can be co...\n", - " a unit conducted the year-end assessment and a...\n", - " according to the above statement, it can be co...\n", - " C. C.\n", - " d. ding.\n", - " 0.272773\n", - " False\n", - " \n", - " \n", - " 15\n", - " robustness\n", - " lowercase\n", - " Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...\n", - " According to the above statement, you can get ...\n", - " zhang ming, li ying, wang jia and chen rui wor...\n", - " according to the above statement, you can get ...\n", - " A. Chen Rui can't speak the Central Plains Ma...\n", - " A. Chen Rui can't speak the Central Plains Ma...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 16\n", - " robustness\n", - " lowercase\n", - " The person in charge of the relevant departmen...\n", - " Which of the following is true will most weake...\n", - " the person in charge of the relevant departmen...\n", - " which of the following is true will most weake...\n", - " A. 
This newly built house in Hexi Village has...\n", - " A. this newly built house in hexi village has...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 17\n", - " robustness\n", - " lowercase\n", - " There are five teams participating in the game...\n", - " The result of the match showed that only one a...\n", - " there are five teams participating in the game...\n", - " the result of the match showed that only one a...\n", - " B. Jiangnan.\n", - " b. jiangnan.\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 18\n", - " robustness\n", - " lowercase\n", - " Compared with small and medium-sized cities, e...\n", - " Which of the following is the conclusion must ...\n", - " compared with small and medium-sized cities, e...\n", - " which of the following is the conclusion must ...\n", - " B. Simple development of large cities is not ...\n", - " b. simple development of large cities is not ...\n", - " 1.000000\n", - " True\n", - " \n", - " \n", - " 19\n", - " robustness\n", - " lowercase\n", - " Researchers recently discovered that there is ...\n", - " Which of the following is assumed by the above...\n", - " researchers recently discovered that there is ...\n", - " which of the following is assumed by the above...\n", - " D. The brain cannot process too much informat...\n", - " d. the brain cannot process too much informat...\n", + " dyslexia_word_swap\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " D. accidents\n", + " D. Accidents\n", " 1.000000\n", " True\n", " \n", @@ -1638,140 +1134,63 @@ "" ], "text/plain": [ - " category test_type original_context \\\n", - "0 robustness uppercase In the planning of a new district in a townshi... \n", - "1 robustness uppercase The company sent three young staff members to ... 
\n", - "2 robustness uppercase In a traditional Chinese medicine preparation,... \n", - "3 robustness uppercase In recent years, graduate entrance examination... \n", - "4 robustness uppercase A unit conducted the year-end assessment and a... \n", - "5 robustness uppercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "6 robustness uppercase The person in charge of the relevant departmen... \n", - "7 robustness uppercase There are five teams participating in the game... \n", - "8 robustness uppercase Compared with small and medium-sized cities, e... \n", - "9 robustness uppercase Researchers recently discovered that there is ... \n", - "10 robustness lowercase In the planning of a new district in a townshi... \n", - "11 robustness lowercase The company sent three young staff members to ... \n", - "12 robustness lowercase In a traditional Chinese medicine preparation,... \n", - "13 robustness lowercase In recent years, graduate entrance examination... \n", - "14 robustness lowercase A unit conducted the year-end assessment and a... \n", - "15 robustness lowercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "16 robustness lowercase The person in charge of the relevant departmen... \n", - "17 robustness lowercase There are five teams participating in the game... \n", - "18 robustness lowercase Compared with small and medium-sized cities, e... \n", - "19 robustness lowercase Researchers recently discovered that there is ... \n", - "\n", - " original_question \\\n", - "0 Based on the above statement, which of the fol... \n", - "1 So what are the three young people on business... \n", - "2 According to the above statement, which of the... \n", - "3 Which of the following can best strengthen the... \n", - "4 According to the above statement, it can be co... \n", - "5 According to the above statement, you can get ... \n", - "6 Which of the following is true will most weake... \n", - "7 The result of the match showed that only one a... 
\n", - "8 Which of the following is the conclusion must ... \n", - "9 Which of the following is assumed by the above... \n", - "10 Based on the above statement, which of the fol... \n", - "11 So what are the three young people on business... \n", - "12 According to the above statement, which of the... \n", - "13 Which of the following can best strengthen the... \n", - "14 According to the above statement, it can be co... \n", - "15 According to the above statement, you can get ... \n", - "16 Which of the following is true will most weake... \n", - "17 The result of the match showed that only one a... \n", - "18 Which of the following is the conclusion must ... \n", - "19 Which of the following is assumed by the above... \n", + " category test_type original_context \\\n", + "0 robustness add_ocr_typo - \n", + "1 robustness add_ocr_typo - \n", + "2 robustness add_ocr_typo - \n", + "3 robustness add_ocr_typo - \n", + "4 robustness add_ocr_typo - \n", + "5 robustness dyslexia_word_swap - \n", + "6 robustness dyslexia_word_swap - \n", + "7 robustness dyslexia_word_swap - \n", + "8 robustness dyslexia_word_swap - \n", "\n", - " perturbed_context \\\n", - "0 IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI... \n", - "1 THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ... \n", - "2 IN A TRADITIONAL CHINESE MEDICINE PREPARATION,... \n", - "3 IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION... \n", - "4 A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A... \n", - "5 ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR... \n", - "6 THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN... \n", - "7 THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME... \n", - "8 COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E... \n", - "9 RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ... \n", - "10 in the planning of a new district in a townshi... \n", - "11 the company sent three young staff members to ... \n", - "12 in a traditional chinese medicine preparation,... 
\n", - "13 in recent years, graduate entrance examination... \n", - "14 a unit conducted the year-end assessment and a... \n", - "15 zhang ming, li ying, wang jia and chen rui wor... \n", - "16 the person in charge of the relevant departmen... \n", - "17 there are five teams participating in the game... \n", - "18 compared with small and medium-sized cities, e... \n", - "19 researchers recently discovered that there is ... \n", + " original_question \\\n", + "0 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "1 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "2 What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting \n", + "3 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "4 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "8 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", "\n", - " perturbed_question \\\n", - "0 BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL... 
\n", - "1 SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS... \n", - "2 ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE... \n", - "3 WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE... \n", - "4 ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO... \n", - "5 ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ... \n", - "6 WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE... \n", - "7 THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A... \n", - "8 WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ... \n", - "9 WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE... \n", - "10 based on the above statement, which of the fol... \n", - "11 so what are the three young people on business... \n", - "12 according to the above statement, which of the... \n", - "13 which of the following can best strengthen the... \n", - "14 according to the above statement, it can be co... \n", - "15 according to the above statement, you can get ... \n", - "16 which of the following is true will most weake... \n", - "17 the result of the match showed that only one a... \n", - "18 which of the following is the conclusion must ... \n", - "19 which of the following is assumed by the above... \n", + " perturbed_context \\\n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - \n", + "5 - \n", + "6 - \n", + "7 - \n", + "8 - \n", "\n", - " expected_result \\\n", - "0 B. The leisure area is southwest of the cultu... \n", - "1 A. 0-year-old accountant, 20-year-old salespe... \n", - "2 B. o Shouwu. \n", - "3 B. Only those who intend to take the graduate... \n", - "4 C. C. \n", - "5 A. Chen Rui can't speak the Central Plains Ma... \n", - "6 A. This newly built house in Hexi Village has... \n", - "7 B. Jiangnan. \n", - "8 B. Simple development of large cities is not ... \n", - "9 D. The brain cannot process too much informat... \n", - "10 B. The leisure area is southwest of the cultu... \n", - "11 A. 0-year-old accountant, 20-year-old salespe... \n", - "12 B. o Shouwu. \n", - "13 B. Only those who intend to take the graduate... 
\n", - "14 C. C. \n", - "15 A. Chen Rui can't speak the Central Plains Ma... \n", - "16 A. This newly built house in Hexi Village has... \n", - "17 B. Jiangnan. \n", - "18 B. Simple development of large cities is not ... \n", - "19 D. The brain cannot process too much informat... \n", + " perturbed_question \\\n", + "0 t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbs \n", + "1 t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftem \n", + "2 whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting \n", + "3 t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children \n", + "4 w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children \n", + "8 What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", "\n", - " actual_result eval_score pass \n", - "0 B. The Leisure Area is Southwest of the Cultu... 1.000000 True \n", - "1 A. 0-YEAR-OLD ACCOUNTANT, 20-YEAR-OLD SALESPE... 1.000000 True \n", - "2 B. O SHOUWU. 1.000000 True \n", - "3 B. 
ONLY THOSE WHO INTEND TO TAKE THE GRADUATE... 1.000000 True \n", - "4 D. DING. 0.272773 False \n", - "5 A. Hen Rui can't speak the Central Plains Man... 0.939710 True \n", - "6 A. This newly built house in Hexi Village has... 1.000000 True \n", - "7 B. JIANGNAN. 1.000000 True \n", - "8 B. Simple development of large cities is not ... 1.000000 True \n", - "9 D. The brain cannot process too much informat... 1.000000 True \n", - "10 b. the leisure area is southwest of the cultu... 1.000000 True \n", - "11 a. 0-year-old accountant, 20-year-old salespe... 1.000000 True \n", - "12 b. o shouwu. 1.000000 True \n", - "13 B. only those who intend to take the graduate... 1.000000 True \n", - "14 d. ding. 0.272773 False \n", - "15 A. Chen Rui can't speak the Central Plains Ma... 1.000000 True \n", - "16 A. this newly built house in hexi village has... 1.000000 True \n", - "17 b. jiangnan. 1.000000 True \n", - "18 b. simple development of large cities is not ... 1.000000 True \n", - "19 d. the brain cannot process too much informat... 1.000000 True " + " expected_result actual_result eval_score pass \n", + "0 B. apartment building B. Apartment building 1.000000 True \n", + "1 D. constellation D. constellation 1.000000 True \n", + "2 C. wonder about C. wonder about 1.000000 True \n", + "3 E. Intelligent children B. own self 0.481119 False \n", + "4 D. accidents D. Accidents 1.000000 True \n", + "5 B. apartment building B. apartment building 1.000000 True \n", + "6 D. constellation D. constellation 1.000000 True \n", + "7 E. Intelligent children E. Intelligent children 1.000000 True \n", + "8 D. accidents D. 
Accidents 1.000000 True " ] }, - "execution_count": 31, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1789,7 +1208,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1826,21 +1245,21 @@ " \n", " 0\n", " robustness\n", - " uppercase\n", + " add_ocr_typo\n", " 1\n", - " 9\n", - " 90%\n", + " 4\n", + " 80%\n", " 66%\n", " True\n", " \n", " \n", " 1\n", " robustness\n", - " lowercase\n", - " 1\n", - " 9\n", - " 90%\n", - " 66%\n", + " dyslexia_word_swap\n", + " 0\n", + " 4\n", + " 100%\n", + " 60%\n", " True\n", " \n", " \n", @@ -1848,16 +1267,16 @@ "" ], "text/plain": [ - " category test_type fail_count pass_count pass_rate minimum_pass_rate \\\n", - "0 robustness uppercase 1 9 90% 66% \n", - "1 robustness lowercase 1 9 90% 66% \n", + " category test_type fail_count pass_count pass_rate \\\n", + "0 robustness add_ocr_typo 1 4 80% \n", + "1 robustness dyslexia_word_swap 0 4 100% \n", "\n", - " pass \n", - "0 True \n", - "1 True " + " minimum_pass_rate pass \n", + "0 66% True \n", + "1 60% True " ] }, - "execution_count": 32, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1938,7 +1357,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1948,11 +1367,11 @@ " 'distance': 'jaro',\n", " 'threshold': 0.1},\n", " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", - " 'robustness': {'uppercase': {'min_pass_rate': 0.66},\n", - " 'lowercase': {'min_pass_rate': 0.66}}}}" + " 'robustness': {'add_ocr_typo': {'min_pass_rate': 0.66},\n", + " 'dyslexia_word_swap': {'min_pass_rate': 0.6}}}}" ] }, - "execution_count": 33, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1963,8 +1382,8 @@ " \"evaluation\": {\"metric\":\"string_distance\",\"distance\":\"jaro\",\"threshold\":0.1},\n", " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", "\n", - " 'robustness': 
{'uppercase': {'min_pass_rate': 0.66},\n", - " 'lowercase': {'min_pass_rate': 0.66}\n", + " 'robustness': {'add_ocr_typo': {'min_pass_rate': 0.66},\n", + " 'dyslexia_word_swap':{'min_pass_rate': 0.60}\n", " }\n", " }\n", " }\n", @@ -1974,7 +1393,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -2014,261 +1433,446 @@ " \n", " 0\n", " robustness\n", - " uppercase\n", - " In the planning of a new district in a townshi...\n", - " Based on the above statement, which of the fol...\n", - " IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI...\n", - " BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL...\n", - " B. The leisure area is southwest of the cultu...\n", - " B. The Leisure Area is Southwest of the Cultu...\n", + " add_ocr_typo\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " -\n", + " t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbs\n", + " B. apartment building\n", + " B. Apartment building\n", " 0.000000\n", " True\n", " \n", " \n", " 1\n", " robustness\n", - " uppercase\n", - " The company sent three young staff members to ...\n", - " So what are the three young people on business...\n", - " THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ...\n", - " SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS...\n", - " A. 0-year-old accountant, 20-year-old salespe...\n", - " A. 0-YEAR-OLD ACCOUNTANT, 20-YEAR-OLD SALESPE...\n", + " add_ocr_typo\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. 
outer lpace\\nD. constellation\\nE. solar fvftem\n", + " D. constellation\n", + " D. constellation\n", " 0.000000\n", " True\n", " \n", " \n", " 2\n", " robustness\n", - " uppercase\n", - " In a traditional Chinese medicine preparation,...\n", - " According to the above statement, which of the...\n", - " IN A TRADITIONAL CHINESE MEDICINE PREPARATION,...\n", - " ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE...\n", - " B. o Shouwu.\n", - " B. O SHOUWU.\n", + " add_ocr_typo\n", + " -\n", + " What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting\n", + " -\n", + " whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting\n", + " C. wonder about\n", + " C. wonder about\n", " 0.000000\n", " True\n", " \n", " \n", " 3\n", " robustness\n", - " uppercase\n", - " In recent years, graduate entrance examination...\n", - " Which of the following can best strengthen the...\n", - " IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION...\n", - " WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE...\n", - " B. Only those who intend to take the graduate...\n", - " B. ONLY THOSE WHO INTEND TO TAKE THE GRADUATE...\n", - " 0.000000\n", - " True\n", + " add_ocr_typo\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children\n", + " -\n", + " t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children\n", + " E. Intelligent children\n", + " B. 
own self\n", + " 0.564559\n", + " False\n", " \n", " \n", " 4\n", " robustness\n", - " uppercase\n", - " A unit conducted the year-end assessment and a...\n", - " According to the above statement, it can be co...\n", - " A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A...\n", - " ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO...\n", - " C. C.\n", - " D. DING.\n", - " 0.341667\n", - " False\n", + " add_ocr_typo\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " D. accidents\n", + " D. Accidents\n", + " 0.000000\n", + " True\n", " \n", " \n", " 5\n", " robustness\n", - " uppercase\n", - " Zhang Ming, Li Ying, Wang Jia and Chen Rui wor...\n", - " According to the above statement, you can get ...\n", - " ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR...\n", - " ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ...\n", - " A. Chen Rui can't speak the Central Plains Ma...\n", - " A. Hen Rui can't speak the Central Plains Man...\n", - " 0.084842\n", + " dyslexia_word_swap\n", + " -\n", + " The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " -\n", + " The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs\n", + " B. apartment building\n", + " B. apartment building\n", + " 0.000000\n", " True\n", " \n", " \n", " 6\n", " robustness\n", - " uppercase\n", - " The person in charge of the relevant departmen...\n", - " Which of the following is true will most weake...\n", - " THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN...\n", - " WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE...\n", - " A. 
This newly built house in Hexi Village has...\n", - " A. This newly built house in Hexi Village has...\n", + " dyslexia_word_swap\n", + " -\n", + " There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " -\n", + " There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system\n", + " D. constellation\n", + " D. constellation\n", " 0.000000\n", " True\n", " \n", " \n", " 7\n", " robustness\n", - " uppercase\n", - " There are five teams participating in the game...\n", - " The result of the match showed that only one a...\n", - " THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME...\n", - " THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A...\n", - " B. Jiangnan.\n", - " B. JIANGNAN.\n", + " dyslexia_word_swap\n", + " -\n", + " The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children\n", + " -\n", + " The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children\n", + " E. Intelligent children\n", + " E. Intelligent children\n", " 0.000000\n", " True\n", " \n", " \n", " 8\n", " robustness\n", - " uppercase\n", - " Compared with small and medium-sized cities, e...\n", - " Which of the following is the conclusion must ...\n", - " COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E...\n", - " WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ...\n", - " B. Simple development of large cities is not ...\n", - " B. Simple development of large cities is not ...\n", + " dyslexia_word_swap\n", + " -\n", + " What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage\n", + " -\n", + " What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. 
accidents\\nE. damage\n", + " D. accidents\n", + " D. Accidents\n", " 0.000000\n", " True\n", " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " category test_type original_context \\\n", + "0 robustness add_ocr_typo - \n", + "1 robustness add_ocr_typo - \n", + "2 robustness add_ocr_typo - \n", + "3 robustness add_ocr_typo - \n", + "4 robustness add_ocr_typo - \n", + "5 robustness dyslexia_word_swap - \n", + "6 robustness dyslexia_word_swap - \n", + "7 robustness dyslexia_word_swap - \n", + "8 robustness dyslexia_word_swap - \n", + "\n", + " original_question \\\n", + "0 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "1 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "2 What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting \n", + "3 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "4 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "8 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. 
damage \n", + "\n", + " perturbed_context \\\n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - \n", + "5 - \n", + "6 - \n", + "7 - \n", + "8 - \n", + "\n", + " perturbed_question \\\n", + "0 t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbs \n", + "1 t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftem \n", + "2 whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting \n", + "3 t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children \n", + "4 w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children \n", + "8 What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "\n", + " expected_result actual_result eval_score pass \n", + "0 B. apartment building B. Apartment building 0.000000 True \n", + "1 D. constellation D. constellation 0.000000 True \n", + "2 C. wonder about C. wonder about 0.000000 True \n", + "3 E. Intelligent children B. own self 0.564559 False \n", + "4 D. accidents D. 
Accidents 0.000000 True \n", + "5 B. apartment building B. apartment building 0.000000 True \n", + "6 D. constellation D. constellation 0.000000 True \n", + "7 E. Intelligent children E. Intelligent children 0.000000 True \n", + "8 D. accidents D. Accidents 0.000000 True " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harness.generated_results()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This method returns the generated results in the form of a pandas dataframe, which provides a convenient and easy-to-use format for working with the test results. You can use this method to quickly identify the test cases that failed and to determine where fixes are needed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Final Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can call `.report()` which summarizes the results giving information about pass and fail counts and overall test pass/fail flag." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + "
categorytest_typefail_countpass_countpass_rateminimum_pass_ratepass
90robustnessuppercaseResearchers recently discovered that there is ...Which of the following is assumed by the above...RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ...WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE...D. The brain cannot process too much informat...D. The brain cannot process too much informat...0.000000add_ocr_typo1480%66%True
101robustnesslowercaseIn the planning of a new district in a townshi...Based on the above statement, which of the fol...in the planning of a new district in a townshi...based on the above statement, which of the fol...B. The leisure area is southwest of the cultu...b. the leisure area is southwest of the cultu...0.000000dyslexia_word_swap04100%60%True
\n", + "
" + ], + "text/plain": [ + " category test_type fail_count pass_count pass_rate \\\n", + "0 robustness add_ocr_typo 1 4 80% \n", + "1 robustness dyslexia_word_swap 0 4 100% \n", + "\n", + " minimum_pass_rate pass \n", + "0 66% True \n", + "1 60% True " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harness.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluating with QAEvalChain\n", + "\n", + "How it operates in LangTest for robustness testing:\n", + "\n", + "- The evaluation process is conducted on provided data, by assessing the original question and the expected results (ground truth), as well as the perturbed question and the actual results.\n", + "- The outcome of the evaluation process determines whether the actual results align with the expected results (ground truth)." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2276,155 +1880,85 @@ "" ], "text/plain": [ - " category test_type original_context \\\n", - "0 robustness uppercase In the planning of a new district in a townshi... 
\n", - "1 robustness uppercase The company sent three young staff members to ... \n", - "2 robustness uppercase In a traditional Chinese medicine preparation,... \n", - "3 robustness uppercase In recent years, graduate entrance examination... \n", - "4 robustness uppercase A unit conducted the year-end assessment and a... \n", - "5 robustness uppercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "6 robustness uppercase The person in charge of the relevant departmen... \n", - "7 robustness uppercase There are five teams participating in the game... \n", - "8 robustness uppercase Compared with small and medium-sized cities, e... \n", - "9 robustness uppercase Researchers recently discovered that there is ... \n", - "10 robustness lowercase In the planning of a new district in a townshi... \n", - "11 robustness lowercase The company sent three young staff members to ... \n", - "12 robustness lowercase In a traditional Chinese medicine preparation,... \n", - "13 robustness lowercase In recent years, graduate entrance examination... \n", - "14 robustness lowercase A unit conducted the year-end assessment and a... \n", - "15 robustness lowercase Zhang Ming, Li Ying, Wang Jia and Chen Rui wor... \n", - "16 robustness lowercase The person in charge of the relevant departmen... \n", - "17 robustness lowercase There are five teams participating in the game... \n", - "18 robustness lowercase Compared with small and medium-sized cities, e... \n", - "19 robustness lowercase Researchers recently discovered that there is ... 
\n", + " category test_type original_context \\\n", + "0 robustness add_ocr_typo - \n", + "1 robustness add_ocr_typo - \n", + "2 robustness add_ocr_typo - \n", + "3 robustness add_ocr_typo - \n", + "4 robustness add_ocr_typo - \n", + "5 robustness dyslexia_word_swap - \n", + "6 robustness dyslexia_word_swap - \n", + "7 robustness dyslexia_word_swap - \n", + "8 robustness dyslexia_word_swap - \n", "\n", - " original_question \\\n", - "0 Based on the above statement, which of the fol... \n", - "1 So what are the three young people on business... \n", - "2 According to the above statement, which of the... \n", - "3 Which of the following can best strengthen the... \n", - "4 According to the above statement, it can be co... \n", - "5 According to the above statement, you can get ... \n", - "6 Which of the following is true will most weake... \n", - "7 The result of the match showed that only one a... \n", - "8 Which of the following is the conclusion must ... \n", - "9 Which of the following is assumed by the above... \n", - "10 Based on the above statement, which of the fol... \n", - "11 So what are the three young people on business... \n", - "12 According to the above statement, which of the... \n", - "13 Which of the following can best strengthen the... \n", - "14 According to the above statement, it can be co... \n", - "15 According to the above statement, you can get ... \n", - "16 Which of the following is true will most weake... \n", - "17 The result of the match showed that only one a... \n", - "18 Which of the following is the conclusion must ... \n", - "19 Which of the following is assumed by the above... \n", + " original_question \\\n", + "0 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "1 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. 
solar system \n", + "2 What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting \n", + "3 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "4 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children \n", + "8 What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", "\n", - " perturbed_context \\\n", - "0 IN THE PLANNING OF A NEW DISTRICT IN A TOWNSHI... \n", - "1 THE COMPANY SENT THREE YOUNG STAFF MEMBERS TO ... \n", - "2 IN A TRADITIONAL CHINESE MEDICINE PREPARATION,... \n", - "3 IN RECENT YEARS, GRADUATE ENTRANCE EXAMINATION... \n", - "4 A UNIT CONDUCTED THE YEAR-END ASSESSMENT AND A... \n", - "5 ZHANG MING, LI YING, WANG JIA AND CHEN RUI WOR... \n", - "6 THE PERSON IN CHARGE OF THE RELEVANT DEPARTMEN... \n", - "7 THERE ARE FIVE TEAMS PARTICIPATING IN THE GAME... \n", - "8 COMPARED WITH SMALL AND MEDIUM-SIZED CITIES, E... \n", - "9 RESEARCHERS RECENTLY DISCOVERED THAT THERE IS ... \n", - "10 in the planning of a new district in a townshi... \n", - "11 the company sent three young staff members to ... \n", - "12 in a traditional chinese medicine preparation,... \n", - "13 in recent years, graduate entrance examination... 
\n", - "14 a unit conducted the year-end assessment and a... \n", - "15 zhang ming, li ying, wang jia and chen rui wor... \n", - "16 the person in charge of the relevant departmen... \n", - "17 there are five teams participating in the game... \n", - "18 compared with small and medium-sized cities, e... \n", - "19 researchers recently discovered that there is ... \n", + " perturbed_context \\\n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - \n", + "5 - \n", + "6 - \n", + "7 - \n", + "8 - \n", "\n", - " perturbed_question \\\n", - "0 BASED ON THE ABOVE STATEMENT, WHICH OF THE FOL... \n", - "1 SO WHAT ARE THE THREE YOUNG PEOPLE ON BUSINESS... \n", - "2 ACCORDING TO THE ABOVE STATEMENT, WHICH OF THE... \n", - "3 WHICH OF THE FOLLOWING CAN BEST STRENGTHEN THE... \n", - "4 ACCORDING TO THE ABOVE STATEMENT, IT CAN BE CO... \n", - "5 ACCORDING TO THE ABOVE STATEMENT, YOU CAN GET ... \n", - "6 WHICH OF THE FOLLOWING IS TRUE WILL MOST WEAKE... \n", - "7 THE RESULT OF THE MATCH SHOWED THAT ONLY ONE A... \n", - "8 WHICH OF THE FOLLOWING IS THE CONCLUSION MUST ... \n", - "9 WHICH OF THE FOLLOWING IS ASSUMED BY THE ABOVE... \n", - "10 based on the above statement, which of the fol... \n", - "11 so what are the three young people on business... \n", - "12 according to the above statement, which of the... \n", - "13 which of the following can best strengthen the... \n", - "14 according to the above statement, it can be co... \n", - "15 according to the above statement, you can get ... \n", - "16 which of the following is true will most weake... \n", - "17 the result of the match showed that only one a... \n", - "18 which of the following is the conclusion must ... \n", - "19 which of the following is assumed by the above... \n", + " perturbed_question \\\n", + "0 t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. 
suburbs \n", + "1 t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftem \n", + "2 whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distracting \n", + "3 t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent children \n", + "4 w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", + "5 The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs \n", + "6 There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system \n", + "7 The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent children \n", + "8 What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage \n", "\n", - " expected_result \\\n", - "0 B. The leisure area is southwest of the cultu... \n", - "1 A. 0-year-old accountant, 20-year-old salespe... \n", - "2 B. o Shouwu. \n", - "3 B. Only those who intend to take the graduate... \n", - "4 C. C. \n", - "5 A. Chen Rui can't speak the Central Plains Ma... \n", - "6 A. This newly built house in Hexi Village has... \n", - "7 B. Jiangnan. \n", - "8 B. Simple development of large cities is not ... \n", - "9 D. The brain cannot process too much informat... \n", - "10 B. The leisure area is southwest of the cultu... \n", - "11 A. 0-year-old accountant, 20-year-old salespe... \n", - "12 B. o Shouwu. \n", - "13 B. Only those who intend to take the graduate... \n", - "14 C. C. \n", - "15 A. 
Chen Rui can't speak the Central Plains Ma... \n", - "16 A. This newly built house in Hexi Village has... \n", - "17 B. Jiangnan. \n", - "18 B. Simple development of large cities is not ... \n", - "19 D. The brain cannot process too much informat... \n", - "\n", - " actual_result eval_score pass \n", - "0 B. The Leisure Area is Southwest of the Cultu... 0.000000 True \n", - "1 A. 0-YEAR-OLD ACCOUNTANT, 20-YEAR-OLD SALESPE... 0.000000 True \n", - "2 B. O SHOUWU. 0.000000 True \n", - "3 B. ONLY THOSE WHO INTEND TO TAKE THE GRADUATE... 0.000000 True \n", - "4 D. DING. 0.341667 False \n", - "5 A. Hen Rui can't speak the Central Plains Man... 0.084842 True \n", - "6 A. This newly built house in Hexi Village has... 0.000000 True \n", - "7 B. JIANGNAN. 0.000000 True \n", - "8 B. Simple development of large cities is not ... 0.000000 True \n", - "9 D. The brain cannot process too much informat... 0.000000 True \n", - "10 b. the leisure area is southwest of the cultu... 0.000000 True \n", - "11 a. 0-year-old accountant, 20-year-old salespe... 0.000000 True \n", - "12 b. o shouwu. 0.000000 True \n", - "13 B. only those who intend to take the graduate... 0.000000 True \n", - "14 d. ding. 0.341667 False \n", - "15 A. Chen Rui can't speak the Central Plains Ma... 0.000000 True \n", - "16 A. this newly built house in hexi village has... 0.000000 True \n", - "17 b. jiangnan. 0.000000 True \n", - "18 b. simple development of large cities is not ... 0.000000 True \n", - "19 d. the brain cannot process too much informat... 0.000000 True " + " expected_result actual_result pass \n", + "0 B. apartment building B. Apartment building True \n", + "1 D. constellation D. constellation True \n", + "2 C. wonder about C. wonder about True \n", + "3 E. Intelligent children B. own self False \n", + "4 D. accidents D. Accidents True \n", + "5 B. apartment building B. apartment building True \n", + "6 D. constellation D. constellation True \n", + "7 E. Intelligent children E. 
Intelligent children True \n", + "8 D. accidents D. Accidents True " ] }, - "execution_count": 34, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# Evaluating with QAEvalChain\n", + "harness.configure(\n", + "{\n", + " 'tests': {'defaults': {'min_pass_rate': 0.65},\n", + "\n", + " 'robustness': {'add_ocr_typo': {'min_pass_rate': 0.66},\n", + " 'dyslexia_word_swap':{'min_pass_rate': 0.60}\n", + " }\n", + " }\n", + " }\n", + "\n", + ")\n", + "\n", + "#Generated Results\n", "harness.generated_results()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This method returns the generated results in the form of a pandas dataframe, which provides a convenient and easy-to-use format for working with the test results. You can use this method to quickly identify the test cases that failed and to determine where fixes are needed." - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -2432,16 +1966,9 @@ "### Final Results" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can call `.report()` which summarizes the results giving information about pass and fail counts and overall test pass/fail flag." 
- ] - }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2478,21 +2005,21 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2500,16 +2027,16 @@ "" ], "text/plain": [ - " category test_type fail_count pass_count pass_rate minimum_pass_rate \\\n", - "0 robustness uppercase 1 9 90% 66% \n", - "1 robustness lowercase 1 9 90% 66% \n", + " category test_type fail_count pass_count pass_rate \\\n", + "0 robustness add_ocr_typo 1 4 80% \n", + "1 robustness dyslexia_word_swap 0 4 100% \n", "\n", - " pass \n", - "0 True \n", - "1 True " + " minimum_pass_rate pass \n", + "0 66% True \n", + "1 60% True " ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" }
categorytest_typeoriginal_contextoriginal_questionperturbed_contextperturbed_questionexpected_resultactual_resultpass
110robustnesslowercaseThe company sent three young staff members to ...So what are the three young people on business...the company sent three young staff members to ...so what are the three young people on business...A. 0-year-old accountant, 20-year-old salespe...a. 0-year-old accountant, 20-year-old salespe...0.000000add_ocr_typo-The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs-t^he townhouse was a h^ard vs^ell f^or the realtor, i^t was rig^ht ncxt t^o a higb rlse v^^hat?\\nA. suburban develoi)ment\\nB. apartment buihiing\\nC. bus ftop\\nD. michigan\\nE. suburbsB. apartment buildingB. Apartment buildingTrue
121robustnesslowercaseIn a traditional Chinese medicine preparation,...According to the above statement, which of the...in a traditional chinese medicine preparation,...according to the above statement, which of the...B. o Shouwu.b. o shouwu.0.000000add_ocr_typo-There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system-t^here is a ftar at tle ccnter of v/hat group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer lpace\\nD. constellation\\nE. solar fvftemD. constellationD. constellationTrue
132robustnesslowercaseIn recent years, graduate entrance examination...Which of the following can best strengthen the...in recent years, graduate entrance examination...which of the following can best strengthen the...B. Only those who intend to take the graduate...B. only those who intend to take the graduate...0.000000add_ocr_typo-What were the kids doing as they looked up at the sky and clouds?\\nA. ponder\\nB. become adults\\nC. wonder about\\nD. open door\\nE. distracting-whai were tlic kids doing as tbey looked up at the sky a^nd clouds?\\nA. ponder\\nB. bec6me adults\\nC. w^onder aboui\\nD. ol)en d6or\\nE. distractingC. wonder aboutC. wonder aboutTrue
143robustnesslowercaseA unit conducted the year-end assessment and a...According to the above statement, it can be co...a unit conducted the year-end assessment and a...according to the above statement, it can be co...C. C.d. ding.0.341667add_ocr_typo-The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children-t^he perfon taught an advanced clasf onhy f^or vHo?\\nA. oAvn houfe\\nB. own self\\nC. wonderful memories\\nD. knoAv trutb\\nE. intelligent childrenE. Intelligent childrenB. own selfFalse
154robustnesslowercaseZhang Ming, Li Ying, Wang Jia and Chen Rui wor...According to the above statement, you can get ...zhang ming, li ying, wang jia and chen rui wor...according to the above statement, you can get ...A. Chen Rui can't speak the Central Plains Ma...A. Chen Rui can't speak the Central Plains Ma...0.000000add_ocr_typo-What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage-w^hat is a likelv consequence of ignorance of rules?\\nA. f1nd tiuth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damageD. accidentsD. AccidentsTrue
165robustnesslowercaseThe person in charge of the relevant departmen...Which of the following is true will most weake...the person in charge of the relevant departmen...which of the following is true will most weake...A. This newly built house in Hexi Village has...A. this newly built house in hexi village has...0.000000dyslexia_word_swap-The townhouse was a hard sell for the realtor, it was right next to a high rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbs-The townhouse was a hard sell four the realtor, it was write next too a hi rise what?\\nA. suburban development\\nB. apartment building\\nC. bus stop\\nD. michigan\\nE. suburbsB. apartment buildingB. apartment buildingTrue
176robustnesslowercaseThere are five teams participating in the game...The result of the match showed that only one a...there are five teams participating in the game...the result of the match showed that only one a...B. Jiangnan.b. jiangnan.0.000000dyslexia_word_swap-There is a star at the center of what group of celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar system-There is a star at the center off what group off celestial bodies?\\nA. hollywood\\nB. skyline\\nC. outer space\\nD. constellation\\nE. solar systemD. constellationD. constellationTrue
187robustnesslowercaseCompared with small and medium-sized cities, e...Which of the following is the conclusion must ...compared with small and medium-sized cities, e...which of the following is the conclusion must ...B. Simple development of large cities is not ...b. simple development of large cities is not ...0.000000dyslexia_word_swap-The person taught an advanced class only for who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. know truth\\nE. intelligent children-The person taught an advanced class only four who?\\nA. own house\\nB. own self\\nC. wonderful memories\\nD. no truth\\nE. intelligent childrenE. Intelligent childrenE. Intelligent childrenTrue
198robustnesslowercaseResearchers recently discovered that there is ...Which of the following is assumed by the above...researchers recently discovered that there is ...which of the following is assumed by the above...D. The brain cannot process too much informat...d. the brain cannot process too much informat...0.000000dyslexia_word_swap-What is a likely consequence of ignorance of rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damage-What is a likely consequence off ignorance off rules?\\nA. find truth\\nB. hostility\\nC. bliss\\nD. accidents\\nE. damageD. accidentsD. AccidentsTrue
0robustnessuppercaseadd_ocr_typo1990%480%66%True
1robustnesslowercase1990%66%dyslexia_word_swap04100%60%True