Skip to content

Commit 169c51c

Browse files
committed
Switch to more flexible JSON schema
1 parent e352d54 commit 169c51c

File tree

3 files changed

+91
-24
lines changed

3 files changed

+91
-24
lines changed

eval_tests.json

Lines changed: 72 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,76 @@
11
[
22
{
3-
"response": "A & Test",
4-
"answer": "A & Test",
5-
"params": {},
6-
"is_correct": true,
7-
"results": {
8-
"response_latex": "A \\cdot \\mathrm{Test}"
9-
}
10-
},
11-
{
12-
"response": "A | B",
13-
"answer": "A & B",
14-
"params": {},
15-
"is_correct": false,
16-
"feedback": "The expressions are not equal."
3+
"title": "Basic expression equality and inequality",
4+
"masterContent": "Demonstrates trivial comparisons",
5+
"parts": [
6+
{
7+
"content": "The response and answer are exactly the same, so the response should be considered correct.",
8+
"responseAreas": [
9+
{
10+
"preResponseText": "",
11+
"answer": "A & B",
12+
"params": {},
13+
"tests": [
14+
{
15+
"description": "Most basic possible case",
16+
"response": "A & B",
17+
"expectedResult": {
18+
"is_correct": true,
19+
"response_latex": "A \\cdot B"
20+
}
21+
}
22+
]
23+
},
24+
{
25+
"preResponseText": "Multi-character variable names are supported.",
26+
"answer": "A & Test",
27+
"params": {},
28+
"tests": [
29+
{
30+
"description": "Works with variable names of any length",
31+
"response": "A & Test",
32+
"expectedResult": {
33+
"is_correct": true,
34+
"response_latex": "A \\cdot \\mathrm{Test}"
35+
}
36+
}
37+
]
38+
}
39+
]
40+
},
41+
{
42+
"content": "",
43+
"responseAreas": [
44+
{
45+
"preResponseText": "Transposition of variables:",
46+
"answer": "A & B",
47+
"params": {},
48+
"tests": [
49+
{
50+
"description": "Tests transposed variables are correct",
51+
"response": "B & A",
52+
"expectedResult": {
53+
"is_correct": true
54+
}
55+
}
56+
]
57+
},
58+
{
59+
"preResponseText": "Trivially incorrect response:",
60+
"answer": "A & B",
61+
"params": {},
62+
"tests": [
63+
{
64+
"description": "Incorrect results marked as false",
65+
"response": "A | B",
66+
"expectedResult": {
67+
"is_correct": false
68+
}
69+
}
70+
]
71+
}
72+
]
73+
}
74+
]
1775
}
1876
]

evaluation_function/evaluation_test.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,8 +91,7 @@ def test_nor_nand(self):
9191
self.assertFalse(result.get("feedback"))
9292

9393
def test_complex(self):
94-
response, answer, params = "A & B | B & C & (B | C)", "B & (A | C)", Params(
95-
)
94+
response, answer, params = "A & B | B & C & (B | C)", "B & (A | C)", Params()
9695

9796
result = evaluation_function(response, answer, params).to_dict()
9897

evaluation_function/json_tests.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ def __init__(self, test_dict: dict):
55
self.response = test_dict["response"]
66
self.answer = test_dict["answer"]
77
self.params = test_dict["params"]
8-
self.is_correct = test_dict["is_correct"]
9-
self.results = test_dict.get("results")
8+
expected_result = test_dict["expectedResult"]
9+
self.is_correct = expected_result["is_correct"]
10+
self.results = expected_result
11+
self.desc = test_dict["description"]
1012

1113
def evaluate(self, func) -> dict:
1214
return func(self.response, self.answer, self.params)
@@ -17,7 +19,7 @@ def compare(self, eval_result: dict) -> tuple[bool, str]:
1719
if eval_correct != self.is_correct:
1820
return (
1921
False,
20-
f"response \"{self.response}\" with answer \"{self.answer}\" was {'' if eval_correct else 'in'}correct: {eval_result['feedback']}."
22+
f"response \"{self.response}\" with answer \"{self.answer}\" was {'' if eval_correct else 'in'}correct: {eval_result['feedback']}\nTest description: {self.desc}"
2123
)
2224

2325
# Are there any other fields in the eval function result that need to be checked?
@@ -31,20 +33,28 @@ def compare(self, eval_result: dict) -> tuple[bool, str]:
3133
if actual_result_val != value:
3234
return (
3335
False,
34-
f"expected {key} = \"{value}\", got {key} = \"{actual_result_val}\""
36+
f"expected {key} = \"{value}\", got {key} = \"{actual_result_val}\"\nTest description: {self.desc}"
3537
)
3638

3739
return (True, "")
3840

3941

4042
def get_tests_from_json(filename: str) -> list[TestData]:
    """Load the structured JSON test file and flatten it into a list of tests.

    The file is expected to contain a list of questions. Each question has a
    "parts" list, each part has a "responseAreas" list, and each response
    area carries an "answer", a "params" object and a "tests" list. Every
    test is combined with the answer and params of its enclosing response
    area and wrapped in a ``TestData``.

    Args:
        filename: Path of the JSON test file to read.

    Returns:
        One ``TestData`` per test entry, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a question, part or response area lacks one of the
            keys listed above.
    """
    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # default encoding.
    with open(filename, "r", encoding="utf-8") as test_file:
        questions = json.load(test_file)

    out = []
    # Convert the structured test data into a flat list of tests
    for question in questions:
        for part in question["parts"]:
            for response_area in part["responseAreas"]:
                params = response_area["params"]
                answer = response_area["answer"]
                for test in response_area["tests"]:
                    # Build a merged dict instead of mutating the parsed
                    # JSON; the response area's answer/params take
                    # precedence over any same-named keys on the test.
                    merged = {**test, "answer": answer, "params": params}
                    out.append(TestData(merged))

    return out
5060

0 commit comments

Comments
 (0)