first change

levi-accherman · levi-accherman · commit e3486da255f3 · 2025-07-18T10:04:53.000+01:00
diff --git a/app/eval_single_testing.py b/app/eval_single_testing.py
@@ -0,0 +1,2 @@
+from evaluation import contains_special_math
+print(contains_special_math("dy/dx"))  # manual smoke check: "dy/dx" matches the ordinary-derivative pattern, so this prints True
diff --git a/app/evaluation.py b/app/evaluation.py
@@ -1,4 +1,11 @@
+import os
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
 from typing import Any, TypedDict
+from sympy import solve, Eq, simplify
+from sympy.parsing.sympy_parser import parse_expr, standard_transformations, implicit_multiplication_application
+import re
 
 
 class Params(TypedDict):
@@ -7,30 +14,134 @@ class Params(TypedDict):
 
 class Result(TypedDict):
     is_correct: bool
+    sympy_result: bool | None  # SymPy verdict; None when SymPy was skipped or raised an error
+    llm_result: bool  # LLM verdict; any reply that is not "true" (case-insensitive) is recorded as False
+    mismatch_info: str  # "" when SymPy and the LLM agree; otherwise explains which verdict was used
+
+
+transformations = standard_transformations + (implicit_multiplication_application,)  # parse_expr settings: SymPy's standard rules plus implicit multiplication (e.g. "2x")
 
 
-def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
+def contains_special_math(expr: str) -> bool:
     """
-    Function used to evaluate a student response.
-    ---
-    The handler function passes three arguments to evaluation_function():
-
-    - `response` which are the answers provided by the student.
-    - `answer` which are the correct answers to compare against.
-    - `params` which are any extra parameters that may be useful,
-        e.g., error tolerances.
-
-    The output of this function is what is returned as the API response
-    and therefore must be JSON-encodable. It must also conform to the
-    response schema.
-
-    Any standard python library may be used, as well as any package
-    available on pip (provided it is added to requirements.txt).
-
-    The way you wish to structure you code (all in this function, or
-    split into many) is entirely up to you. All that matters are the
-    return types and that evaluation_function() is the main function used
-    to output the evaluation response.
+    Return True if ``expr`` contains derivative or integral notation (d/dx, ∂/∂x, diff(...), int, ∫), else False.
+    """
+    patterns = [
+        r"d(\^|\*\*)?\d*(\*\*)?\w*/d\w+(\^|\*\*)?\d*(\*\*)?",  # ordinary derivative (dy/dx, d^2y/dx^2)
+        r"∂(\^|\*\*)?\d*(\*\*)?\w*/∂\w+(\^|\*\*)?\d*(\*\*)?",  # partial derivative (∂y/∂x, ∂^2y/∂x^2)
+        r"diff\(\s*\w+\s*,\s*\w+",  # diff call with any comma spacing or arity: diff(y,x), diff(y, x, x)
+        r"(?<![A-Za-z_])int(?![A-Za-z])",  # "int" as its own token (int(f(x), x), int_b^a), not inside words like "point"
+        r"∫",  # integral sign
+    ]
+    return any(re.search(p, expr) for p in patterns)
+
+
+def is_equivalent_sympy(expr1, expr2) -> bool | None:
+    """
+    Compare two expression or equation strings for mathematical equivalence with SymPy.
+    Gives True/False when the comparison completes and None when any step raises.
+    """
+    def parse(text):
+        # NOTE(review): parse_expr evaluates its input with eval(); the strings reaching
+        # this function look like raw student input - confirm that is acceptable.
+        return parse_expr(text, transformations=transformations)
+
+    try:
+        first, second = expr1.replace("^", "**"), expr2.replace("^", "**")
+        first_blank, second_blank = not first.strip(), not second.strip()
+        if first_blank or second_blank:
+            # Two blank inputs match; a blank input never matches a non-blank one.
+            return first_blank and second_blank
+
+        if "=" not in first or "=" not in second:
+            # At most one side is an equation: compare as plain expressions,
+            # which are equivalent when their difference simplifies to zero.
+            first_parsed = parse(first)
+            second_parsed = parse(second)
+            return simplify(first_parsed - second_parsed) == 0
+
+        # Both inputs are equations: rewrite each as "lhs - rhs = 0" and
+        # compare the solution sets over the union of their free symbols.
+        lhs1, rhs1 = first.split("=")
+        lhs2, rhs2 = second.split("=")
+        lhs1_parsed, rhs1_parsed = parse(lhs1), parse(rhs1)
+        lhs2_parsed, rhs2_parsed = parse(lhs2), parse(rhs2)
+        eq1 = Eq(lhs1_parsed - rhs1_parsed, 0)
+        eq2 = Eq(lhs2_parsed - rhs2_parsed, 0)
+        unknowns = eq1.free_symbols.union(eq2.free_symbols)
+        sol1 = solve(eq1, list(unknowns))
+        sol2 = solve(eq2, list(unknowns))
+        return set(sol1) == set(sol2)
+
+    except Exception as e:
+        print(f" SymPy error: {e}")
+        return None
+
+
+def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
+    """
+    Grade a student response against the reference answer with SymPy and an LLM.
+
+    SymPy is consulted only when neither input contains derivative/integral
+    notation; the LLM is always queried. A SymPy verdict, when available,
+    decides ``is_correct`` and any disagreement with the LLM is described in
+    ``mismatch_info``. Without a SymPy verdict the LLM verdict is returned.
+
+    ``params`` is not read. OPENAI_MODEL and OPENAI_API_KEY are looked up in the
+    environment after load_dotenv() runs; a missing one raises KeyError.
+    """
+    load_dotenv()
+    llm = ChatOpenAI(
+        model=os.environ['OPENAI_MODEL'],
+        api_key=os.environ["OPENAI_API_KEY"],
+    )
+
+    # Inputs with derivative/integral notation skip SymPy and rely on the LLM alone.
+    needs_llm_priority = contains_special_math(response) or contains_special_math(answer)
+    sympy_result = None if needs_llm_priority else is_equivalent_sympy(response, answer)
+
+    prompt = fr"""
+    Follow these steps carefully:
+    A student response and an answer are provided below. Compare the two if they are mathematically equivalent.
+    Only return True if they are **exactly equivalent** for all possible values of all variables.
+    Do not assume expressions are equivalent based on similarity.
+    There are a few types of symbols for differentiation and the following in the same square brackets are considered equivalent:
+    [dy/dx, d/dx(y), diff(y,x)], [d^2y/dx^2, d**2y/dx**2, diff(y,x,x)], [∂y/∂x, ∂/∂x(y), diff(y,x), partial(y)/partial(x)], [∂^2y/∂x^2, ∂**2y/∂x**2, diff(y,x,x), partial**2(y)/partial(x)**2, partial^2(y)/partial(x)^2]
+    The terms above that are not in the same square brackets are not considered equivalent.
+    Student response: {response}
+    Answer: {answer}
+
+    Return either True or False as a single word and nothing else.
     """
+    llm_response = llm.invoke(prompt)
+    llm_result_text = llm_response.content.strip().lower()
+    # Only a bare "true" counts as True; "false" and any malformed reply count as False.
+    llm_result = llm_result_text == "true"
 
-    return Result(is_correct=True)
+    if sympy_result is None:
+        # SymPy was skipped or failed, so the LLM verdict is the only one available.
+        return Result(
+            is_correct=llm_result,
+            sympy_result=None,
+            llm_result=llm_result,
+            mismatch_info="Used LLM result only",
+        )
+
+    if sympy_result == llm_result:
+        mismatch_info = ""
+    else:
+        # is_correct reports the SymPy verdict below, so the note must name SymPy.
+        mismatch_info = (
+            f"Mismatch detected:\n"
+            f"- SymPy result: {sympy_result}\n"
+            f"- LLM result: {llm_result}\n"
+            f"Used SymPy result due to mismatch"
+        )
+
+    # With a SymPy verdict available, it is the one returned as is_correct.
+    return Result(
+        is_correct=sympy_result,
+        sympy_result=sympy_result,
+        llm_result=llm_result,
+        mismatch_info=mismatch_info,
+    )
diff --git a/app/evaluation_test_cases.py b/app/evaluation_test_cases.py
@@ -0,0 +1,26 @@
+from evaluation import Params
+# Each case is [response, answer, params, expected is_correct].
+test_cases = [
+            ["2+2", "4", Params(), True],
+            ["sin(x)**2 + cos(x)**2", "1", Params(), True],
+            ["x+y", "y+x", Params(), True],
+            ["x*y", "x+y", Params(), False],
+            ["x**2 + 2*x + 1", "(x+1)**2", Params(), True],
+            ["x**2 - 1", "(x-1)*(x+1)", Params(), True],
+            ["x^5-1", "(x-1)*(x**4+x**3+x**2+x+1)", Params(), True],  # "^" is accepted as the power operator
+            ["sin(x) + cos(x)", "cos(x) + sin(x)", Params(), True],
+            ["sin(x) * cos(x)", "sin(x) + cos(x)", Params(), False],
+            ["exp(x) * exp(y)", "exp(x+y)", Params(), True],
+            ["log(x*y)", "log(x) + log(y)", Params(), False],
+            ["x**3 + x**2", "x**2 * (x + 1)", Params(), True],
+            ["", "", Params(), True],  # two empty inputs count as equivalent
+            ["", "x", Params(), False],  # empty versus non-empty is a mismatch
+            ["x+1=0", "-2x-2=0", Params(), True],  # equations; "-2x" relies on implicit multiplication
+            ["dy/dx", "diff(y, x)", Params(), True],  # derivative notation: SymPy is skipped, the LLM decides
+            ["(x+y)/x", "1 + y/x", Params(), True],
+            ["∂y/∂x", "diff(y, x)", Params(), True],
+            ["∫f(x)dx", "int(f(x), x)", Params(), True],
+            ["∂^2y/∂x^2", "diff(diff(y, x), x)", Params(), True],
+            ["dy/dx + 1", "diff(y, x) + 1", Params(), True],
+            ["∂y/∂x + 1", "diff(y, x) + 1", Params(), True],
+            ]
diff --git a/app/evaluation_tests.py b/app/evaluation_tests.py
@@ -4,32 +4,37 @@
     from .evaluation import Params, evaluation_function
 except ImportError:
     from evaluation import Params, evaluation_function
+from evaluation_test_cases import test_cases
 
 
 class TestEvaluationFunction(unittest.TestCase):
     """
     TestCase Class used to test the algorithm.
-    ---
-    Tests are used here to check that the algorithm written
-    is working as it should.
-
-    It's best practise to write these tests first to get a
-    kind of 'specification' for how your algorithm should
-    work, and you should run these tests before committing
-    your code to AWS.
-
-    Read the docs on how to use unittest here:
-    https://docs.python.org/3/library/unittest.html
-
-    Use evaluation_function() to check your algorithm works
-    as it should.
     """
 
-    def test_returns_is_correct_true(self):
-        response, answer, params = None, None, Params()
-        result = evaluation_function(response, answer, params)
-
-        self.assertEqual(result.get("is_correct"), True)
+    def test_multiple_cases(self):
+        """
+        Check every entry of test_cases in its own subTest.
+
+        Assertion failures are left to unittest so a wrong verdict fails the
+        run; the failure message carries mismatch_info from the result.
+        """
+        passed = 0
+
+        for i, (response, answer, params, expected) in enumerate(test_cases, 1):
+            with self.subTest(test_case=i):
+                result = evaluation_function(response, answer, params)
+                is_correct = result.get("is_correct")
+                mismatch_info = result.get("mismatch_info")
+                failure = f"Test {i} Failed: expected {expected}, got {is_correct}"
+                if mismatch_info:
+                    failure += f"\nMismatch Info (Test {i}):\n{mismatch_info}"
+                self.assertEqual(is_correct, expected, failure)
+                print(f"Test {i} Passed")
+                passed += 1
+
+        failed = len(test_cases) - passed
+        print(f"\n--- Summary ---\nPassed: {passed}, Failed: {failed}, Total: {passed + failed}")
 
 
 if __name__ == "__main__":
diff --git a/app/requirements.txt b/app/requirements.txt
@@ -0,0 +1,6 @@
+# Third-party packages only. os, typing and re belong to the Python
+# standard library and must not be listed as pip requirements.
+
+sympy
+python-dotenv
+langchain-openai

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from evaluation import contains_special_math`
	`2`	`+print(contains_special_math("dy/dx"))`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,6 @@ @@
 +os
 +typing
 +sympy
 +re
 +python-dotenv
 +langchain-openai