Parse answer with parse_input; pass delimited strings instead of lists in evaluation tests

BigBigboss02 · BigBigboss02 · commit 6f01ad74b251 · 2025-02-19T20:56:56.000Z
diff --git a/app/evaluation.py b/app/evaluation.py
@@ -97,15 +97,16 @@ def recursive_evaluation(responses, answers, chain, parser):
 
 def evaluation_function(response, answer, param=None):
     """Evaluates the given response against the answer using LLaMA 3 or GPT-4o."""
-
+    start_time = time.process_time()
 
 
 
     #split the response and answer into lists with semicolons
     response = parse_input(response)
+    answer = parse_input(answer)
+
 
 
-    start_time = time.process_time()
     
     # Ensure config is provided
     if param is None:
@@ -192,8 +193,8 @@ def evaluation_function(response, answer, param=None):
 if __name__ == "__main__":
     custom_config = Param()
     print(evaluation_function(
-        ["speed"], #response
-        ["velocity"], #answer
+        "speed,red", #response
+        "red, velocity", #answer
         custom_config
     ))
     
diff --git a/app/evaluation_tests.py b/app/evaluation_tests.py
@@ -24,72 +24,77 @@ def setUpClass(cls):
 
     def test_basic_correct_response(self):
         """Test if semantically similar responses are marked correct."""
-        response = ["Density", "Velocity", "Viscosity", "Length"]
-        answer = ["Density", "Velocity", "Viscosity", "Length"]
+        response = "Density;Velocity;Viscosity;Length"
+        answer = "Density;Velocity;Viscosity;Length"
         result = evaluation_function(response, answer, self.param)
 
         self.assertTrue(result.get("is_correct"))
 
     def test_basic_incorrect_response(self):
         """Test if semantically different responses are marked incorrect."""
-        response = ["Mass", "Speed", "Friction", "Force"]
-        answer = ["Density", "Velocity", "Viscosity", "Length"]
+        response = "Mass;Speed;Friction;Force"
+        answer = "Density;Velocity;Viscosity;Length"
         result = evaluation_function(response, answer, self.param)
 
+
         self.assertFalse(result.get("is_correct"))
 
     def test_partial_match(self):
         """Test if a response too short is marked incorrect."""
-        response = ["Density", "Velocity", "Viscosity"]
-        answer = ["Density", "Velocity", "Viscosity", "Length"]
+        response = "Density;Velocity;Viscosity"
+        answer = "Density;Velocity;Viscosity;Length"
 
         self.param.response_num_required = 4
         result = evaluation_function(response, answer, self.param)
+
         self.param.response_num_required = 0
-        
-        self.assertFalse(result.get("is_correct"))
 
+        self.assertFalse(result.get("is_correct"))
 
     def test_synonyms_match(self):
-        """Test if abbriviations are correctly identified."""
-        response = ['velocity']
-        answer = ['speed']
+        """Test if synonyms are correctly identified."""
+        response = "velocity"
+        answer = "speed"
         result = evaluation_function(response, answer, self.param)
 
+
         self.assertTrue(result.get("is_correct"))
 
     def test_exact_match_requirement(self):
         """Test enforcing exact match on keystrings."""
-        response = ["density", "speed", "viscosity", "length"]
-        answer = ["Density", "Velocity", "Viscosity", "Length"]
+        response = "density;speed;viscosity;length"
+        answer = "Density;Velocity;Viscosity;Length"
 
         result = evaluation_function(response, answer, self.param)
+
         self.assertTrue(result.get("is_correct"))
 
     def test_should_not_contain(self):
         """Test if a response with a prohibited keyword fails."""
-        response = ["density", "velocity", "viscosity", "length", "direction"]
-        answer = ["Density", "Velocity", "Viscosity", "Length"]
+        response = "density;velocity;viscosity;length;direction"
+        answer = "Density;Velocity;Viscosity;Length"
 
         result = evaluation_function(response, answer, self.param)
-        self.assertFalse(result.get("is_correct"))
 
+        self.assertFalse(result.get("is_correct"))
 
     def test_negation_handling(self):
         """Test how the model handles negation."""
-        response = ["not light blue", "dark blue"]
-        answer = ["light blue"]
+        response = "not light blue;dark blue"
+        answer = "light blue"
 
         result = evaluation_function(response, answer, self.param)
 
+
         self.assertFalse(result.get("is_correct"))
 
     def test_performance(self):
         """Ensure that processing time is reasonable."""
-        response = ["Density", "Velocity", "Viscosity", "Length"]
-        answer = ["Density", "Velocity", "Viscosity", "Length"]
+        response = "Density;Velocity;Viscosity;Length"
+        answer = "Density;Velocity;Viscosity;Length"
 
         result = evaluation_function(response, answer, self.param)
+
         processing_time = result.get("result", {}).get("processing_time", 0)
 
         self.assertLess(processing_time, 5, msg="Evaluation function should run efficiently.")