Commit 7e4f7d3

Evaluation: Show Total Cost (#445)
* added cost
* added test cases
1 parent f6dc638 · commit 7e4f7d3

File tree

3 files changed: +184 −5 lines changed


backend/app/crud/evaluations/langfuse.py

Lines changed: 42 additions & 3 deletions
@@ -20,15 +20,20 @@ def create_langfuse_dataset_run(
     dataset_name: str,
     run_name: str,
     results: list[dict[str, Any]],
+    model: str | None = None,
 ) -> dict[str, str]:
     """
     Create a dataset run in Langfuse with traces for each evaluation item.
 
     This function:
     1. Gets the dataset from Langfuse (which already exists)
     2. For each result, creates a trace linked to the dataset item
-    3. Logs input (question), output (generated_output), and expected (ground_truth)
-    4. Returns a mapping of item_id -> trace_id for later score updates
+    3. Creates a generation within the trace with usage/model for cost tracking
+    4. Logs input (question), output (generated_output), and expected (ground_truth)
+    5. Returns a mapping of item_id -> trace_id for later score updates
+
+    Note: Cost tracking in Langfuse happens at the generation level, not the trace level.
+    We create a generation within each trace to enable automatic cost calculation.
 
     Args:
         langfuse: Configured Langfuse client
@@ -41,10 +46,16 @@ def create_langfuse_dataset_run(
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
-                "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8"
+                "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8",
+                "usage": {
+                    "input_tokens": 69,
+                    "output_tokens": 258,
+                    "total_tokens": 327
+                }
             },
             ...
         ]
+        model: Model name used for evaluation (for cost calculation by Langfuse)
 
     Returns:
         dict[str, str]: Mapping of item_id to Langfuse trace_id
@@ -71,6 +82,7 @@ def create_langfuse_dataset_run(
             generated_output = result["generated_output"]
             ground_truth = result["ground_truth"]
             response_id = result.get("response_id")
+            usage_raw = result.get("usage")
 
             dataset_item = dataset_items_map.get(item_id)
             if not dataset_item:
@@ -89,12 +101,39 @@ def create_langfuse_dataset_run(
             if response_id:
                 metadata["response_id"] = response_id
 
+            # Create trace with basic info
             langfuse.trace(
                 id=trace_id,
                 input={"question": question},
                 output={"answer": generated_output},
                 metadata=metadata,
             )
+
+            # Convert usage to Langfuse format
+            usage = None
+            if usage_raw:
+                usage = {
+                    "input": usage_raw.get("input_tokens", 0),
+                    "output": usage_raw.get("output_tokens", 0),
+                    "total": usage_raw.get("total_tokens", 0),
+                    "unit": "TOKENS",
+                }
+
+            # Create a generation within the trace for cost tracking
+            # (cost tracking happens at the generation level, not the trace level)
+            if usage and model:
+                generation = langfuse.generation(
+                    name="evaluation-response",
+                    trace_id=trace_id,
+                    input={"question": question},
+                    metadata=metadata,
+                )
+                generation.end(
+                    output={"answer": generated_output},
+                    model=model,
+                    usage=usage,
+                )
+
             trace_id_mapping[item_id] = trace_id
 
         except Exception as e:
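The cost-tracking pattern this diff introduces can be exercised in isolation. Below is a minimal sketch, assuming a Langfuse v2-style Python client configured via LANGFUSE_* environment variables; the trace id and token counts are illustrative, and only calls already visible in the diff and tests (trace, generation, generation.end, flush) are used:

from langfuse import Langfuse

langfuse = Langfuse()  # assumes LANGFUSE_PUBLIC_KEY / SECRET_KEY / HOST in the environment

trace_id = "trace-123"  # hypothetical id; the production code derives one per dataset item
langfuse.trace(
    id=trace_id,
    input={"question": "What is 2+2?"},
    output={"answer": "4"},
)

# Cost is derived from (model, usage) on the generation, not on the trace,
# so a generation is attached to the trace and closed with both fields.
generation = langfuse.generation(
    name="evaluation-response",
    trace_id=trace_id,
    input={"question": "What is 2+2?"},
)
generation.end(
    output={"answer": "4"},
    model="gpt-4o",
    usage={"input": 69, "output": 258, "total": 327, "unit": "TOKENS"},
)
langfuse.flush()  # push buffered events before the process exits

With this shape, Langfuse can price the generation from its model price table whenever the supplied model name is one it knows.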

backend/app/crud/evaluations/processing.py

Lines changed: 14 additions & 1 deletion
@@ -63,7 +63,12 @@ def parse_evaluation_output(
             "question": "What is 2+2?",
             "generated_output": "4",
             "ground_truth": "4",
-            "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8"
+            "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8",
+            "usage": {
+                "input_tokens": 69,
+                "output_tokens": 258,
+                "total_tokens": 327
+            }
         },
         ...
     ]
@@ -97,6 +102,9 @@ def parse_evaluation_output(
         # Extract response ID from response.body.id
         response_id = response_body.get("id")
 
+        # Extract usage information for cost tracking
+        usage = response_body.get("usage")
+
         # Handle errors in batch processing
         if response.get("error"):
             error_msg = response["error"].get("message", "Unknown error")
@@ -152,6 +160,7 @@ def parse_evaluation_output(
                 "generated_output": generated_output,
                 "ground_truth": ground_truth,
                 "response_id": response_id,
+                "usage": usage,
             }
         )
 
@@ -244,12 +253,16 @@ async def process_completed_evaluation(
     if not results:
         raise ValueError("No valid results found in batch output")
 
+    # Extract model from config for cost tracking
+    model = eval_run.config.get("model") if eval_run.config else None
+
     # Step 5: Create Langfuse dataset run with traces
     trace_id_mapping = create_langfuse_dataset_run(
         langfuse=langfuse,
        dataset_name=eval_run.dataset_name,
         run_name=eval_run.run_name,
         results=results,
+        model=model,
     )
 
     # Store object store URL in database
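For context on the data this parser now forwards, here is a hedged sketch of pulling usage out of one line of batch output. The JSON layout (a custom_id plus a response.body envelope) is an assumption about the batch results format; only the .get() accesses shown in the diff are relied on:

import json

# One output line from the batch results file (illustrative payload).
line = (
    '{"custom_id": "item_1", "response": {"body": {'
    '"id": "resp_abc", '
    '"usage": {"input_tokens": 69, "output_tokens": 258, "total_tokens": 327}}}}'
)

record = json.loads(line)
response = record.get("response", {})
response_body = response.get("body", {})

response_id = response_body.get("id")  # -> "resp_abc"
usage = response_body.get("usage")     # forwarded unchanged into each result dict
print(response_id, usage)

The usage dict is deliberately passed through untouched here; the conversion to Langfuse's {"input", "output", "total", "unit"} shape happens later, in create_langfuse_dataset_run.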

backend/app/tests/crud/evaluations/test_langfuse.py

Lines changed: 128 additions & 1 deletion
@@ -34,19 +34,31 @@ def test_create_langfuse_dataset_run_success(self):
         mock_dataset.items = [mock_item1, mock_item2]
         mock_langfuse.get_dataset.return_value = mock_dataset
 
-        # Test data
+        # Test data with usage and response_id
         results = [
             {
                 "item_id": "item_1",
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 10,
+                    "output_tokens": 5,
+                    "total_tokens": 15,
+                },
             },
             {
                 "item_id": "item_2",
                 "question": "What is the capital of France?",
                 "generated_output": "Paris",
                 "ground_truth": "Paris",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 12,
+                    "output_tokens": 3,
+                    "total_tokens": 15,
+                },
             },
         ]
 
@@ -88,12 +100,24 @@ def test_create_langfuse_dataset_run_skips_missing_items(self):
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 10,
+                    "output_tokens": 5,
+                    "total_tokens": 15,
+                },
             },
             {
                 "item_id": "item_nonexistent",
                 "question": "Invalid question",
                 "generated_output": "Invalid",
                 "ground_truth": "Invalid",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 8,
+                    "output_tokens": 2,
+                    "total_tokens": 10,
+                },
             },
         ]
 
@@ -133,12 +157,24 @@ def test_create_langfuse_dataset_run_handles_trace_error(self):
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 10,
+                    "output_tokens": 5,
+                    "total_tokens": 15,
+                },
             },
             {
                 "item_id": "item_2",
                 "question": "What is the capital?",
                 "generated_output": "Paris",
                 "ground_truth": "Paris",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 8,
+                    "output_tokens": 2,
+                    "total_tokens": 10,
+                },
             },
         ]
 
@@ -171,6 +207,97 @@ def test_create_langfuse_dataset_run_empty_results(self):
         assert len(trace_id_mapping) == 0
         mock_langfuse.flush.assert_called_once()
 
+    def test_create_langfuse_dataset_run_with_cost_tracking(self):
+        """Test that generation() is called with usage when model and usage are provided."""
+        # Mock Langfuse client
+        mock_langfuse = MagicMock()
+        mock_dataset = MagicMock()
+        mock_generation = MagicMock()
+
+        # Mock dataset items
+        mock_item1 = MagicMock()
+        mock_item1.id = "item_1"
+        mock_item1.observe.return_value.__enter__.return_value = "trace_id_1"
+
+        mock_item2 = MagicMock()
+        mock_item2.id = "item_2"
+        mock_item2.observe.return_value.__enter__.return_value = "trace_id_2"
+
+        mock_dataset.items = [mock_item1, mock_item2]
+        mock_langfuse.get_dataset.return_value = mock_dataset
+        mock_langfuse.generation.return_value = mock_generation
+
+        # Test data with usage and model
+        results = [
+            {
+                "item_id": "item_1",
+                "question": "What is 2+2?",
+                "generated_output": "The answer is 4",
+                "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 69,
+                    "output_tokens": 258,
+                    "total_tokens": 327,
+                },
+            },
+            {
+                "item_id": "item_2",
+                "question": "What is the capital of France?",
+                "generated_output": "Paris is the capital",
+                "ground_truth": "Paris",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 50,
+                    "output_tokens": 100,
+                    "total_tokens": 150,
+                },
+            },
+        ]
+
+        # Call function with model parameter
+        trace_id_mapping = create_langfuse_dataset_run(
+            langfuse=mock_langfuse,
+            dataset_name="test_dataset",
+            run_name="test_run",
+            results=results,
+            model="gpt-4o",
+        )
+
+        # Verify results
+        assert len(trace_id_mapping) == 2
+        assert trace_id_mapping["item_1"] == "trace_id_1"
+        assert trace_id_mapping["item_2"] == "trace_id_2"
+
+        # Verify generation() was called for cost tracking
+        assert mock_langfuse.generation.call_count == 2
+
+        # Verify the first generation call
+        first_call = mock_langfuse.generation.call_args_list[0]
+        assert first_call.kwargs["name"] == "evaluation-response"
+        assert first_call.kwargs["trace_id"] == "trace_id_1"
+        assert first_call.kwargs["input"] == {"question": "What is 2+2?"}
+        assert first_call.kwargs["metadata"]["ground_truth"] == "4"
+        assert first_call.kwargs["metadata"]["response_id"] == "resp_123"
+
+        # Verify generation.end() was called with usage
+        assert mock_generation.end.call_count == 2
+
+        first_end_call = mock_generation.end.call_args_list[0]
+        assert first_end_call.kwargs["output"] == {"answer": "The answer is 4"}
+        assert first_end_call.kwargs["model"] == "gpt-4o"
+        assert first_end_call.kwargs["usage"] == {
+            "input": 69,
+            "output": 258,
+            "total": 327,
+            "unit": "TOKENS",
+        }
+
+        # Verify Langfuse calls
+        mock_langfuse.get_dataset.assert_called_once_with("test_dataset")
+        mock_langfuse.flush.assert_called_once()
+        assert mock_langfuse.trace.call_count == 2
+
 
 class TestUpdateTracesWithCosineScores:
     """Test updating Langfuse traces with cosine similarity scores."""
