Skip to content

Commit 6f01ad7

Browse files
committed
updated evaluation text
1 parent cd8b4d3 commit 6f01ad7

File tree

2 files changed

+30
-24
lines changed

2 files changed

+30
-24
lines changed

app/evaluation.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,15 +97,16 @@ def recursive_evaluation(responses, answers, chain, parser):
9797

9898
def evaluation_function(response, answer, param=None):
9999
"""Evaluates the given response against the answer using LLaMA 3 or GPT-4o."""
100-
100+
start_time = time.process_time()
101101

102102

103103

104104
#split the response and answer into lists with semicolons
105105
response = parse_input(response)
106+
answer = parse_input(answer)
107+
106108

107109

108-
start_time = time.process_time()
109110

110111
# Ensure config is provided
111112
if param is None:
@@ -192,8 +193,8 @@ def evaluation_function(response, answer, param=None):
192193
if __name__ == "__main__":
193194
custom_config = Param()
194195
print(evaluation_function(
195-
["speed"], #response
196-
["velocity"], #answer
196+
"speed,red", #response
197+
"red, velocity", #answer
197198
custom_config
198199
))
199200

app/evaluation_tests.py

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,72 +24,77 @@ def setUpClass(cls):
2424

2525
def test_basic_correct_response(self):
2626
"""Test if semantically similar responses are marked correct."""
27-
response = ["Density", "Velocity", "Viscosity", "Length"]
28-
answer = ["Density", "Velocity", "Viscosity", "Length"]
27+
response = "Density;Velocity;Viscosity;Length"
28+
answer = "Density;Velocity;Viscosity;Length"
2929
result = evaluation_function(response, answer, self.param)
3030

3131
self.assertTrue(result.get("is_correct"))
3232

3333
def test_basic_incorrect_response(self):
3434
"""Test if semantically different responses are marked incorrect."""
35-
response = ["Mass", "Speed", "Friction", "Force"]
36-
answer = ["Density", "Velocity", "Viscosity", "Length"]
35+
response = "Mass;Speed;Friction;Force"
36+
answer = "Density;Velocity;Viscosity;Length"
3737
result = evaluation_function(response, answer, self.param)
3838

39+
3940
self.assertFalse(result.get("is_correct"))
4041

4142
def test_partial_match(self):
4243
"""Test if a response that is too short is marked incorrect."""
43-
response = ["Density", "Velocity", "Viscosity"]
44-
answer = ["Density", "Velocity", "Viscosity", "Length"]
44+
response = "Density;Velocity;Viscosity"
45+
answer = "Density;Velocity;Viscosity;Length"
4546

4647
self.param.response_num_required = 4
4748
result = evaluation_function(response, answer, self.param)
49+
4850
self.param.response_num_required = 0
49-
50-
self.assertFalse(result.get("is_correct"))
5151

52+
self.assertFalse(result.get("is_correct"))
5253

5354
def test_synonyms_match(self):
54-
"""Test if abbriviations are correctly identified."""
55-
response = ['velocity']
56-
answer = ['speed']
55+
"""Test if synonyms are correctly identified."""
56+
response = "velocity"
57+
answer = "speed"
5758
result = evaluation_function(response, answer, self.param)
5859

60+
5961
self.assertTrue(result.get("is_correct"))
6062

6163
def test_exact_match_requirement(self):
6264
"""Test enforcing exact match on keystrings."""
63-
response = ["density", "speed", "viscosity", "length"]
64-
answer = ["Density", "Velocity", "Viscosity", "Length"]
65+
response = "density;speed;viscosity;length"
66+
answer = "Density;Velocity;Viscosity;Length"
6567

6668
result = evaluation_function(response, answer, self.param)
69+
6770
self.assertTrue(result.get("is_correct"))
6871

6972
def test_should_not_contain(self):
7073
"""Test if a response with a prohibited keyword fails."""
71-
response = ["density", "velocity", "viscosity", "length", "direction"]
72-
answer = ["Density", "Velocity", "Viscosity", "Length"]
74+
response = "density;velocity;viscosity;length;direction"
75+
answer = "Density;Velocity;Viscosity;Length"
7376

7477
result = evaluation_function(response, answer, self.param)
75-
self.assertFalse(result.get("is_correct"))
7678

79+
self.assertFalse(result.get("is_correct"))
7780

7881
def test_negation_handling(self):
7982
"""Test how the model handles negation."""
80-
response = ["not light blue", "dark blue"]
81-
answer = ["light blue"]
83+
response = "not light blue;dark blue"
84+
answer = "light blue"
8285

8386
result = evaluation_function(response, answer, self.param)
8487

88+
8589
self.assertFalse(result.get("is_correct"))
8690

8791
def test_performance(self):
8892
"""Ensure that processing time is reasonable."""
89-
response = ["Density", "Velocity", "Viscosity", "Length"]
90-
answer = ["Density", "Velocity", "Viscosity", "Length"]
93+
response = "Density;Velocity;Viscosity;Length"
94+
answer = "Density;Velocity;Viscosity;Length"
9195

9296
result = evaluation_function(response, answer, self.param)
97+
9398
processing_time = result.get("result", {}).get("processing_time", 0)
9499

95100
self.assertLess(processing_time, 5, msg="Evaluation function should run efficiently.")

0 commit comments

Comments
 (0)