diff --git a/examples/FinanceBench-Lite/.env.template b/examples/FinanceBench-AMD/.env.template
similarity index 100%
rename from examples/FinanceBench-Lite/.env.template
rename to examples/FinanceBench-AMD/.env.template
diff --git a/examples/FinanceBench-Lite/.gitignore b/examples/FinanceBench-AMD/.gitignore
similarity index 100%
rename from examples/FinanceBench-Lite/.gitignore
rename to examples/FinanceBench-AMD/.gitignore
diff --git a/examples/FinanceBench-Lite/Makefile b/examples/FinanceBench-AMD/Makefile
similarity index 100%
rename from examples/FinanceBench-Lite/Makefile
rename to examples/FinanceBench-AMD/Makefile
diff --git a/examples/FinanceBench-Lite/README.md b/examples/FinanceBench-AMD/README.md
similarity index 87%
rename from examples/FinanceBench-Lite/README.md
rename to examples/FinanceBench-AMD/README.md
index 6b27245db..638517d2f 100644
--- a/examples/FinanceBench-Lite/README.md
+++ b/examples/FinanceBench-AMD/README.md
@@ -1,9 +1,9 @@
-
+
 # OpenSSA-FinanceBench Lite benchmarking
 
 This is a lite version of the benchmarking of `OpenSSA` performance
-on the `FinanceBench` dataset. We will use 1 question from the dataset to demonstrate the use of `OpenSSA` with `DANA` architecture.
+on the `FinanceBench` dataset. We will use 1 question from the dataset to demonstrate the use of `OpenSSA` with `DANA` architecture.
 
 ## [`FinanceBench` Dataset](https://github.com/patronus-ai/financebench/blob/main/financebench_sample_150.csv)
@@ -19,26 +19,26 @@ Create `.env` file following the `.env.template` and fill in necessary credentia
 
 __Solve__ the problem corresponding to a problem `00807` `financebench_id`: __`make dana-solve id=00807`__.
 
-### Question
+
+**Question**:
 
 `Does 3M have a reasonably healthy liquidity profile based on its quick ratio for Q2 of FY2023? If the quick ratio is not relevant to measure liquidity, please state that and explain why.`
 
-### Knowledge
+**Knowledge**
 
 To solve this question, you can add knowledge related to `liquidity`. See the example below:
 
 - Liquidity Metric Formulas
-  - `(Net) Working Capital` = `(Total) Current Assets` - `(Total) Current Liabilities`
-  - `Working Capital Ratio` = `(Total) Current Assets` / `(Total) Current Liabilities`
+  - `(Net) Working Capital` = `(Total) Current Assets` - `(Total) Current Liabilities`
+  - `Working Capital Ratio` = `(Total) Current Assets` / `(Total) Current Liabilities`
 
 Go to `knowledge-store.txt` to add relevant knowledge yourself and see how it helps the agent to solve this question.
 
-### Program
-
-With the above-provided knowledge, the program we can provide to the agent could be as below:
+**Program**
 
+With the above-provided knowledge, the program we can provide to the agent could be as below:
 - Goal: To assess liquidity health of a company, calculate `quick ratio`
-  - Task: To calculate `quick ratio`, use this formula
+  - Task: To calculate `quick ratio`, use this formula
     `Quick Ratio` = (
       (`Cash & Cash Equivalents` +
        `Short-Term Investments or (Current) Marketable Securities` +
@@ -53,6 +53,5 @@ With the above-provided knowledge, the program we can provide to the agent could
 Go to `program-store.yml` to see details of the program yourself! You can experiment with different plans to see how it helps the agent solve the problem as well.
 
 ## Advancing DANA Agent with Domain Knowledge and Program Store
-
 - To solve the question with added domain knowledge, run `make dana-solve-w-knowledge id=00807`
 - To solve the question with added domain knowledge and program store, run `make dana-solve-w-knowledge-and-prog-store id=00807`
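The program-store hunk above is cut off mid-formula by the diff context, but the computation it describes is the standard quick ratio. A minimal sketch in plain Python, with illustrative parameter names that are not identifiers from this repo:

```python
# Hypothetical illustration of the quick-ratio formula described in the README's
# program store; the remaining numerator terms follow the standard definition
# since the hunk above is truncated. Names are illustrative only.
def quick_ratio(cash_and_equivalents: float,
                short_term_investments: float,
                net_accounts_receivable: float,
                total_current_liabilities: float) -> float:
    """(Cash & equivalents + short-term investments + receivables) / current liabilities."""
    return ((cash_and_equivalents + short_term_investments + net_accounts_receivable)
            / total_current_liabilities)

# A ratio at or above roughly 1.0 is conventionally read as a healthy liquidity profile,
# which is the judgment the agent's program is asked to make for 3M's Q2 FY2023.
```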
diff --git a/examples/FinanceBench-Lite/dana.py b/examples/FinanceBench-AMD/dana.py
similarity index 92%
rename from examples/FinanceBench-Lite/dana.py
rename to examples/FinanceBench-AMD/dana.py
index 92ec4ee61..8d7e1ca22 100644
--- a/examples/FinanceBench-Lite/dana.py
+++ b/examples/FinanceBench-AMD/dana.py
@@ -8,7 +8,7 @@
 # pylint: disable=wrong-import-order,wrong-import-position
 from data_and_knowledge import (DocName, FbId, Answer, Doc, FB_ID_COL_NAME, DOC_NAMES_BY_FB_ID, QS_BY_FB_ID,
                                 EXPERT_KNOWLEDGE, EXPERT_PROGRAMS, EXPERT_HTP_COMPANY_KEY, EXPERT_HTP_PERIOD_KEY)
-from util import QAFunc, enable_batch_qa_and_eval, log_qa_and_update_output_file
+from util import QAFunc, log_qa_and_update_output_file
 
 
 @cache
@@ -51,7 +51,6 @@ def get_or_create_adaptations(doc_name: DocName) -> dict[str, str]:
     return {EXPERT_HTP_COMPANY_KEY: (doc := Doc(name=doc_name)).company, EXPERT_HTP_PERIOD_KEY: doc.period}
 
 
-@enable_batch_qa_and_eval(output_name='DANA')
 @log_qa_and_update_output_file(output_name='DANA')
 def solve(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id]).solve(
@@ -59,7 +58,6 @@ def solve(fb_id: FbId) -> Answer:
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge')
 def solve_with_knowledge(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True).solve(
@@ -67,7 +65,6 @@ def solve_with_knowledge(fb_id: FbId) -> Answer:
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wProgStore')
 @log_qa_and_update_output_file(output_name='DANA-wProgStore')
 def solve_with_program_store(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_programs=True).solve(
@@ -75,7 +72,6 @@ def solve_with_program_store(fb_id: FbId) -> Answer:
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge-wProgStore')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge-wProgStore')
 def solve_with_knowledge_and_program_store(fb_id: FbId) -> Answer:
     return get_or_create_agent(DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True, expert_programs=True).solve(
@@ -83,7 +79,6 @@ def solve_with_knowledge_and_program_store(fb_id: FbId) -> Answer:
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wLlama3')
 def solve_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], use_llama3=True).solve(
@@ -91,7 +86,6 @@ def solve_with_llama3(fb_id: FbId) -> Answer:
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge-wLlama3')
 def solve_with_knowledge_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True, use_llama3=True).solve(
@@ -99,7 +93,6 @@ def solve_with_knowledge_with_llama3(fb_id: FbId) -> Answer:
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wProgStore-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wProgStore-wLlama3')
 def solve_with_program_store_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(doc_name=DOC_NAMES_BY_FB_ID[fb_id], expert_programs=True, use_llama3=True).solve(
@@ -107,7 +100,6 @@ def solve_with_program_store_with_llama3(fb_id: FbId) -> Answer:
         adaptations_from_known_programs=get_or_create_adaptations(doc_name=DOC_NAMES_BY_FB_ID[fb_id]))
 
 
-@enable_batch_qa_and_eval(output_name='DANA-wKnowledge-wProgStore-wLlama3')
 @log_qa_and_update_output_file(output_name='DANA-wKnowledge-wProgStore-wLlama3')
 def solve_with_knowledge_and_program_store_with_llama3(fb_id: FbId) -> Answer:
     return get_or_create_agent(DOC_NAMES_BY_FB_ID[fb_id], expert_knowledge=True, expert_programs=True, use_llama3=True).solve(  # noqa: E501
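Every solver in dana.py loses its `enable_batch_qa_and_eval` decorator in this change, so each function now answers exactly one FinanceBench case per call. A hypothetical single-case driver, assuming it runs inside the example directory with credentials configured per the README:

```python
# Hypothetical driver script; `solve` and `solve_with_knowledge` are the decorated
# functions from dana.py above, and '00807' is the case used in the README.
from dana import solve, solve_with_knowledge

if __name__ == '__main__':
    print(solve('00807'))                 # bare DANA agent
    print(solve_with_knowledge('00807'))  # DANA + expert knowledge from knowledge-store.txt
```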
diff --git a/examples/FinanceBench-Lite/data_and_knowledge.py b/examples/FinanceBench-AMD/data_and_knowledge.py
similarity index 100%
rename from examples/FinanceBench-Lite/data_and_knowledge.py
rename to examples/FinanceBench-AMD/data_and_knowledge.py
diff --git a/examples/FinanceBench-Lite/ground-truths.yml b/examples/FinanceBench-AMD/ground-truths.yml
similarity index 100%
rename from examples/FinanceBench-Lite/ground-truths.yml
rename to examples/FinanceBench-AMD/ground-truths.yml
diff --git a/examples/FinanceBench-Lite/knowledge-store.txt b/examples/FinanceBench-AMD/knowledge-store.txt
similarity index 100%
rename from examples/FinanceBench-Lite/knowledge-store.txt
rename to examples/FinanceBench-AMD/knowledge-store.txt
diff --git a/examples/FinanceBench-Lite/log.py b/examples/FinanceBench-AMD/log.py
similarity index 100%
rename from examples/FinanceBench-Lite/log.py
rename to examples/FinanceBench-AMD/log.py
diff --git a/examples/FinanceBench-Lite/program-store.yml b/examples/FinanceBench-AMD/program-store.yml
similarity index 100%
rename from examples/FinanceBench-Lite/program-store.yml
rename to examples/FinanceBench-AMD/program-store.yml
diff --git a/examples/FinanceBench-Lite/rag-ground-truths.yml b/examples/FinanceBench-AMD/rag-ground-truths.yml
similarity index 100%
rename from examples/FinanceBench-Lite/rag-ground-truths.yml
rename to examples/FinanceBench-AMD/rag-ground-truths.yml
diff --git a/examples/FinanceBench-Lite/util.py b/examples/FinanceBench-AMD/util.py
similarity index 50%
rename from examples/FinanceBench-Lite/util.py
rename to examples/FinanceBench-AMD/util.py
index 3025beadb..a7ab24305 100644
--- a/examples/FinanceBench-Lite/util.py
+++ b/examples/FinanceBench-AMD/util.py
@@ -9,7 +9,6 @@
 from tqdm import tqdm
 
 from data_and_knowledge import FbId, Answer, FB_IDS, DOC_NAMES_BY_FB_ID, QS_BY_FB_ID, OUTPUT_FILE_PATH, get_or_create_output_df  # noqa: E501
-from eval import eval_correctness, eval_all
 from log import switch_log_file
 
 if TYPE_CHECKING:
@@ -19,38 +18,6 @@
 type QAFunc = Callable[[FbId], Answer]
 
 
-@dataclass
-class enable_batch_qa_and_eval:  # noqa: N801
-    output_name: str
-
-    def __call__(self, qa_func: QAFunc) -> QAFunc:
-        @wraps(wrapped=qa_func)
-        def decorated_qa_func(fb_id: FbId) -> Answer | None:
-            if 'all' in fb_id.lower():
-                for _fb_id in tqdm(FB_IDS):
-                    # run inferencing and preliminarily evaluate
-                    eval_correctness(fb_id=_fb_id, answer=qa_func(_fb_id), output_name=self.output_name, human=False)
-
-                # rigorously evaluate again, including human evaluation for difficult cases
-                eval_all(output_name=self.output_name, refresh=True)
-                return None
-
-            if 'from:' in fb_id.lower():
-                for _fb_id in tqdm(FB_IDS[FB_IDS.index(fb_id[5:]):]):
-                    # run inferencing and preliminarily evaluate
-                    eval_correctness(fb_id=_fb_id, answer=qa_func(_fb_id), output_name=self.output_name, human=False)
-
-                # rigorously evaluate again, including human evaluation for difficult cases
-                eval_all(output_name=self.output_name, refresh=True)
-                return None
-
-            # run inferencing and evaluate
-            eval_correctness(fb_id=fb_id, answer=(answer := qa_func(fb_id)), output_name=self.output_name, human=True)
-            return answer
-
-        return decorated_qa_func
-
-
 @dataclass
 class log_qa_and_update_output_file:  # noqa: N801
     output_name: str
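The deleted `enable_batch_qa_and_eval` dataclass is what previously backed `id=all` and `id=from:<fb_id>` batch runs, looping the solvers over `FB_IDS` and scoring each answer. If batch inference is still wanted after this change, a sketch of a replacement loop using only names that survive it (evaluation omitted, since eval.py is deleted below):

```python
# Sketch of a batch run replacing the deleted decorator's `id=all` branch;
# FB_IDS and tqdm remain available in this example, but this driver itself
# is hypothetical, not part of the change.
from tqdm import tqdm

from data_and_knowledge import FB_IDS  # all FinanceBench case IDs
from dana import solve

answers = {fb_id: solve(fb_id) for fb_id in tqdm(FB_IDS)}
```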
diff --git a/examples/FinanceBench-Lite/eval.py b/examples/FinanceBench-Lite/eval.py
deleted file mode 100644
index 77f491f4f..000000000
--- a/examples/FinanceBench-Lite/eval.py
+++ /dev/null
@@ -1,301 +0,0 @@
-from __future__ import annotations
-
-import argparse
-from collections import defaultdict
-from functools import cache
-from pprint import pprint
-from typing import TYPE_CHECKING
-
-from dotenv import load_dotenv
-from loguru import logger
-from pandas import DataFrame, notna, read_csv
-from tqdm import tqdm
-
-from openssa.core.util.lm.config import LMConfig
-from openssa.core.util.lm.openai import OpenAILM
-
-# pylint: disable=wrong-import-order
-from data_and_knowledge import (FbId, Question, Answer, Category, GroundTruth,
-                                FB_ID_COL_NAME, GROUND_TRUTHS, N_CASES, CAT_DISTRIB,
-                                LOCAL_CACHE_DIR_PATH, OUTPUT_FILE_PATH, get_or_create_output_df)
-from log import switch_log_file
-
-if TYPE_CHECKING:
-    from openssa.core.util.lm.abstract import AbstractLM
-
-
-EVAL_PROMPT_TEMPLATE: str = \
-"""You shall act as a judge of question-answering correctness.
-
-Given the posed QUESTION below, evaluate whether the ANSWER below is correct
-according to the criteria specified in the CORRECTNESS EVALUATION RUBRIC below.
-
-- The evaluation should regard the ANSWER as responding to the QUESTION,
-  and hence the ANSWER does not need to repeat contextual information already in the QUESTION;
-
-- The evaluation should follow the RUBRIC strictly,
-  not looking for in the ANSWER more elaboration/explanation than what the RUBRIC explicitly requires;
-
-- Financial and technical terminology can be treated as case-insensitive.
-
-Output only a single word, either:
-- YES: if you judge the ANSWER to be correct; or
-- NO: if you judge the ANSWER to be incorrect.
-
-QUESTION:
----------
-```
-{question}
-```
-
-ANSWER TO EVALUATE:
--------------------
-```
-{answer}
-```
-
-CORRECTNESS EVALUATION RUBRIC:
-------------------------------
-```
-{rubric}
-```
-"""  # noqa: E122
-
-
-load_dotenv()
-
-
-@cache
-def get_lm(model='gpt-4o') -> AbstractLM:
-    return OpenAILM(model=model, api_key=LMConfig.OPENAI_API_KEY, api_base=LMConfig.OPENAI_API_URL)
-
-
-def human_eval_recommended(fb_id: FbId) -> bool | None:
-    return (ground_truth := GROUND_TRUTHS[fb_id]).get('answer-inadequate') or ground_truth.get('evaluator-unreliable')
-
-
-def eval_correctness(fb_id: FbId, answer: Answer, output_name: str | None = None,  # pylint: disable=too-many-arguments
-                     n_times: int = 9, human: bool = True, debug: bool = False) -> bool:
-    if output_name:
-        switch_log_file(fb_id=fb_id, output_name=output_name)
-
-    question: Question = (ground_truth := GROUND_TRUTHS[fb_id])['question']
-    rubric: str = ground_truth['correctness']
-    prompt: str = EVAL_PROMPT_TEMPLATE.format(question=question, answer=answer, rubric=rubric)
-
-    lm: AbstractLM = get_lm()
-
-    for _ in range(n_times):
-        score: str = ''
-
-        while score not in {'YES', 'NO'}:
-            score: str = lm.get_response(prompt=prompt, temperature=0)
-
-        if score == 'NO':
-            logger.warning(f'\n{fb_id}\n{ground_truth['doc']}:\n{question}\n'
-                           '\n'
-                           f'ANSWER JUDGED TO BE INCORRECT:\n{answer}\n'
-                           '\n'
-                           f'RUBRIC:\n{rubric}' +
-                           ('\n\n(*** EXPERT ANSWER KNOWN TO BE INADEQUATE ***)\n'
-                            if GROUND_TRUTHS[fb_id].get('answer-inadequate')
-                            else '\n'))
-
-            if debug:
-                logger.debug(f'PROMPT:\n{prompt}')
-
-            if human and human_eval_recommended(fb_id=fb_id):
-                human_eval_str: str = ''
-                while not human_eval_str:
-                    human_eval_str: str = input('\n*** HUMAN EVALUATION ***: if answer is correct, type "Y": ').strip()
-
-                correct: bool = human_eval_str.lower().startswith('y')
-
-            else:
-                correct: bool = False
-
-            break
-
-    else:
-        correct: bool = True
-
-    if output_name:
-        output_df: DataFrame = get_or_create_output_df()
-        output_df.loc[fb_id, f'{output_name}---CORRECTNESS']: bool = correct
-        output_df.to_csv(OUTPUT_FILE_PATH, index=True)
-
-    return correct
-
-
-def eval_all(output_name: str, refresh: bool = True, n_times: int = 9, human: bool = True, debug: bool = False):
-    # pylint: disable=too-many-locals
-    output_df: DataFrame = get_or_create_output_df()
-
-    n_yes_scores_by_category: defaultdict = defaultdict(int)
-    incorrect_answer_fb_ids: dict[FbId, str] = {}
-
-    for fb_id, answer in tqdm(output_df[output_name].items(), total=N_CASES):
-        ground_truth: GroundTruth = GROUND_TRUTHS[fb_id]
-
-        if (eval_correctness(fb_id=fb_id, answer=answer, output_name=output_name, n_times=n_times, human=human, debug=debug)  # noqa: E501
-            if refresh
-            else (notna(correctness := output_df.loc[fb_id, f'{output_name}---CORRECTNESS']) and correctness)):
-            n_yes_scores_by_category[ground_truth['category']] += 1
-
-        else:
-            incorrect_answer_fb_ids[fb_id]: str = ('expert answer inadequate'
-                                                   if ground_truth.get('answer-inadequate')
-                                                   else ('evaluator unreliable'
-                                                         if ground_truth.get('evaluator-unreliable')
-                                                         else ''))
-
-    logger.info(f'TOTAL CORRECT: {(n := sum(n_yes_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}')
-
-    pprint(correctness_by_category := {category: (f'{(n := n_yes_scores_by_category[category])} / {n_for_category} '
                                                   f'= {n / n_for_category:.1%}')
-                                       for category, n_for_category in CAT_DISTRIB.items()})
-
-    pprint({
-        'EASY': (f'{(e := sum(n_yes_scores_by_category[easy_cat]
-                              for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / '
-                 f'{(se := sum(CAT_DISTRIB[easy_cat]
-                               for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} '
-                 f'= {e / se:.1%}'),
-
-        'HARD': (f'{(h := sum(n_yes_scores_by_category[hard_cat]
-                              for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
-                                               Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / '
-                 f'{(sh := sum(CAT_DISTRIB[hard_cat]
-                               for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
-                                                Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} '
-                 f'= {h / sh:.1%}')
-    })
-
-    logger.warning('INCORRECT:')
-    pprint(incorrect_answer_fb_ids)
-
-    return correctness_by_category
-
-
-def compare_eval(output_name: str, baseline_output_name: str = 'RAG-Default'):
-    output_df: DataFrame = get_or_create_output_df()
-
-    baseline_correctness_by_category: dict[str, str] = eval_all(output_name=baseline_output_name, refresh=False)
-    correctness_by_category: dict[str, str] = eval_all(output_name=output_name, refresh=False)
-    pprint({category: {output_name: correctness_summary, baseline_output_name: baseline_correctness_by_category[category]}
-            for category, correctness_summary in correctness_by_category.items()})
-
-    output_df.loc[:, baseline_output_name] = output_df[f'{baseline_output_name}---CORRECTNESS']
-    output_df.loc[:, output_name] = output_df[f'{output_name}---CORRECTNESS']
-    return output_df.loc[output_df[output_name] != output_df[baseline_output_name],
-                         ['doc_name', 'category', baseline_output_name, output_name]]
-
-
-def eval_accuracy_and_consistency_wrt_ground_truths(output_name: str, output_file_names: list[str]):
-    # pylint: disable=too-many-locals
-
-    n_output_files: int = len(output_file_names)
-    correctness_col_name: str = f'{output_name}---CORRECTNESS'
-
-    n_yes_scores_by_fb_id: defaultdict = defaultdict(int)
-    incorrect_answer_fb_ids: dict[FbId, str] = {}
-
-    for output_df in (read_csv(LOCAL_CACHE_DIR_PATH / output_file_name, index_col=FB_ID_COL_NAME)
-                      for output_file_name in output_file_names):
-
-        for fb_id, correctness in output_df[correctness_col_name].items():
-            ground_truth: GroundTruth = GROUND_TRUTHS[fb_id]
-
-            if notna(correctness) and correctness:
-                n_yes_scores_by_fb_id[fb_id] += 1
-
-            else:
-                incorrect_answer_fb_ids[fb_id]: str = ('expert answer inadequate'
-                                                       if ground_truth.get('answer-inadequate')
-                                                       else ('evaluator unreliable'
-                                                             if ground_truth.get('evaluator-unreliable')
-                                                             else ''))
-
-    cumu_avg_accuracy_scores_by_category: defaultdict = defaultdict(int)
-    cumu_consistency_scores_by_category: defaultdict = defaultdict(float)
-
-    for fb_id, ground_truth in GROUND_TRUTHS.items():
-        cumu_avg_accuracy_scores_by_category[cat := ground_truth['category']] += (a := n_yes_scores_by_fb_id[fb_id] / n_output_files)
-        cumu_consistency_scores_by_category[cat] += 2 * abs(a - 0.5)
-
-    print(f'TOTAL CORRECT: {(n := sum(cumu_avg_accuracy_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}')
-
-    pprint({category: (f'{(n := cumu_avg_accuracy_scores_by_category[category])} / {n_for_category} '
-                       f'= {n / n_for_category:.1%}')
-            for category, n_for_category in CAT_DISTRIB.items()})
-
-    pprint({
-        'EASY': (f'{(e := sum(cumu_avg_accuracy_scores_by_category[easy_cat]
-                              for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / '
-                 f'{(se := sum(CAT_DISTRIB[easy_cat]
-                               for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} '
-                 f'= {e / se:.1%}'),
-
-        'HARD': (f'{(h := sum(cumu_avg_accuracy_scores_by_category[hard_cat]
-                              for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
-                                               Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / '
-                 f'{(sh := sum(CAT_DISTRIB[hard_cat]
-                               for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
-                                                Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} '
-                 f'= {h / sh:.1%}')
-    })
-
-    print(f'\nTOTAL CONSISTENT: {(n := sum(cumu_consistency_scores_by_category.values()))} / {N_CASES} = {n / N_CASES:.1%}')
-
-    pprint({category: (f'{(n := cumu_consistency_scores_by_category[category])} / {n_for_category} '
-                       f'= {n / n_for_category:.1%}')
-            for category, n_for_category in CAT_DISTRIB.items()})
-
-    pprint({
-        'EASY': (f'{(e := sum(cumu_consistency_scores_by_category[easy_cat]
-                              for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} / '
-                 f'{(se := sum(CAT_DISTRIB[easy_cat]
-                               for easy_cat in (Category.RETRIEVE, Category.COMPARE, Category.CALC_CHANGE)))} '
-                 f'= {e / se:.1%}'),
-
-        'HARD': (f'{(h := sum(cumu_consistency_scores_by_category[hard_cat]
-                              for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
-                                               Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} / '
-                 f'{(sh := sum(CAT_DISTRIB[hard_cat]
-                               for hard_cat in (Category.CALC_COMPLEX, Category.CALC_AND_JUDGE,
-                                                Category.EXPLAIN_FACTORS, Category.OTHER_ADVANCED)))} '
-                 f'= {h / sh:.1%}')
-    })
-
-    print('\nINCORRECT:')
-    pprint(incorrect_answer_fb_ids)
-
-
-if __name__ == '__main__':
-    arg_parser = argparse.ArgumentParser()
-
-    arg_parser.add_argument('answer_col', help='Name of the column containing answers to evaluate')
-    arg_parser.add_argument('--id', default='all', help='FinanceBench Case ID')
-    arg_parser.add_argument('--n-times', type=int, default=9, help='Number of times to evaluate')
-
-    arg_parser.add_argument('--human-eval', dest='human_eval', action='store_true', help='Human Evaluation ON')
-    arg_parser.add_argument('--no-human-eval', dest='human_eval', action='store_false', help='Human Evaluation OFF')
-    arg_parser.set_defaults(human_eval=True)
-
-    arg_parser.add_argument('--refresh', dest='refresh', action='store_true', help='Evaluation Refreshing ON')
-    arg_parser.add_argument('--no-refresh', dest='refresh', action='store_false', help='Evaluation Refreshing OFF')
-    arg_parser.set_defaults(refresh=True)
-
-    arg_parser.add_argument('--debug', action='store_true', help='Debug by printing out prompts')
-
-    args = arg_parser.parse_args()
-
-    if 'all' in args.id.lower():
-        eval_all(output_name=args.answer_col, refresh=args.refresh, n_times=args.n_times, human=args.human_eval, debug=args.debug)  # noqa: E501
-
-    else:
-        logger.info(
-            eval_correctness(fb_id=args.id,
-                             answer=read_csv(OUTPUT_FILE_PATH, index_col=FB_ID_COL_NAME).loc[args.id, args.answer_col],
-                             output_name=args.answer_col,
-                             n_times=args.n_times, human=args.human_eval, debug=args.debug))
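For reference, the deleted eval.py implemented an LLM-as-judge protocol: a judge LM is prompted for a bare YES/NO verdict against each case's correctness rubric, up to `n_times`, with a human override for cases flagged as unreliable. A condensed sketch of that core loop, with the prompt abbreviated and the human-override and CSV-bookkeeping paths dropped:

```python
# Condensed sketch of the judge loop from the deleted eval.py; the LM classes and
# the get_response/temperature usage mirror the deleted code, the rest is simplified.
from openssa.core.util.lm.config import LMConfig
from openssa.core.util.lm.openai import OpenAILM

EVAL_PROMPT_TEMPLATE = (  # abbreviated from the deleted file's full rubric-based prompt
    'You shall act as a judge of question-answering correctness.\n\n'
    'QUESTION:\n{question}\n\nANSWER TO EVALUATE:\n{answer}\n\n'
    'CORRECTNESS EVALUATION RUBRIC:\n{rubric}\n\n'
    'Output only a single word, either YES or NO.'
)


def judge_answer(question: str, answer: str, rubric: str, n_times: int = 9) -> bool:
    lm = OpenAILM(model='gpt-4o', api_key=LMConfig.OPENAI_API_KEY, api_base=LMConfig.OPENAI_API_URL)
    prompt = EVAL_PROMPT_TEMPLATE.format(question=question, answer=answer, rubric=rubric)

    for _ in range(n_times):
        score = ''
        while score not in {'YES', 'NO'}:  # retry until the judge emits a clean verdict
            score = lm.get_response(prompt=prompt, temperature=0)
        if score == 'NO':
            return False  # the original offered a human override here for flagged cases
    return True  # all n_times verdicts were YES
```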