@@ -34,19 +34,31 @@ def test_create_langfuse_dataset_run_success(self):
3434 mock_dataset .items = [mock_item1 , mock_item2 ]
3535 mock_langfuse .get_dataset .return_value = mock_dataset
3636
37- # Test data
37+ # Test data with usage and response_id
3838 results = [
3939 {
4040 "item_id" : "item_1" ,
4141 "question" : "What is 2+2?" ,
4242 "generated_output" : "4" ,
4343 "ground_truth" : "4" ,
44+ "response_id" : "resp_123" ,
45+ "usage" : {
46+ "input_tokens" : 10 ,
47+ "output_tokens" : 5 ,
48+ "total_tokens" : 15 ,
49+ },
4450 },
4551 {
4652 "item_id" : "item_2" ,
4753 "question" : "What is the capital of France?" ,
4854 "generated_output" : "Paris" ,
4955 "ground_truth" : "Paris" ,
56+ "response_id" : "resp_456" ,
57+ "usage" : {
58+ "input_tokens" : 12 ,
59+ "output_tokens" : 3 ,
60+ "total_tokens" : 15 ,
61+ },
5062 },
5163 ]
5264
@@ -88,12 +100,24 @@ def test_create_langfuse_dataset_run_skips_missing_items(self):
88100 "question" : "What is 2+2?" ,
89101 "generated_output" : "4" ,
90102 "ground_truth" : "4" ,
103+ "response_id" : "resp_123" ,
104+ "usage" : {
105+ "input_tokens" : 10 ,
106+ "output_tokens" : 5 ,
107+ "total_tokens" : 15 ,
108+ },
91109 },
92110 {
93111 "item_id" : "item_nonexistent" ,
94112 "question" : "Invalid question" ,
95113 "generated_output" : "Invalid" ,
96114 "ground_truth" : "Invalid" ,
115+ "response_id" : "resp_456" ,
116+ "usage" : {
117+ "input_tokens" : 8 ,
118+ "output_tokens" : 2 ,
119+ "total_tokens" : 10 ,
120+ },
97121 },
98122 ]
99123
@@ -133,12 +157,24 @@ def test_create_langfuse_dataset_run_handles_trace_error(self):
133157 "question" : "What is 2+2?" ,
134158 "generated_output" : "4" ,
135159 "ground_truth" : "4" ,
160+ "response_id" : "resp_123" ,
161+ "usage" : {
162+ "input_tokens" : 10 ,
163+ "output_tokens" : 5 ,
164+ "total_tokens" : 15 ,
165+ },
136166 },
137167 {
138168 "item_id" : "item_2" ,
139169 "question" : "What is the capital?" ,
140170 "generated_output" : "Paris" ,
141171 "ground_truth" : "Paris" ,
172+ "response_id" : "resp_456" ,
173+ "usage" : {
174+ "input_tokens" : 8 ,
175+ "output_tokens" : 2 ,
176+ "total_tokens" : 10 ,
177+ },
142178 },
143179 ]
144180
@@ -171,6 +207,97 @@ def test_create_langfuse_dataset_run_empty_results(self):
171207 assert len (trace_id_mapping ) == 0
172208 mock_langfuse .flush .assert_called_once ()
173209
def test_create_langfuse_dataset_run_with_cost_tracking(self):
    """Check that a dataset run records one generation per result for cost tracking.

    Two mocked dataset items are paired with two results that carry a
    ``response_id`` and a ``usage`` dict, and the run is created with
    ``model="gpt-4o"``. The test then inspects the mocked Langfuse client:
    ``generation()`` must be opened once per result with the question and
    metadata, and ``generation.end()`` must receive the answer, the model
    name and the usage re-keyed as ``input``/``output``/``total`` with
    ``unit="TOKENS"``.
    """
    # Langfuse client double; every generation() call hands back the same mock
    # so that its end() calls can be counted and inspected afterwards.
    client = MagicMock()
    generation = MagicMock()
    client.generation.return_value = generation

    # Dataset items whose observe() context manager yields a known trace id.
    dataset_items = []
    for item_id, trace_id in (("item_1", "trace_id_1"), ("item_2", "trace_id_2")):
        item = MagicMock()
        item.id = item_id
        item.observe.return_value.__enter__.return_value = trace_id
        dataset_items.append(item)

    dataset = MagicMock()
    dataset.items = dataset_items
    client.get_dataset.return_value = dataset

    # Evaluation results carrying token usage and the upstream response id.
    first_result = {
        "item_id": "item_1",
        "question": "What is 2+2?",
        "generated_output": "The answer is 4",
        "ground_truth": "4",
        "response_id": "resp_123",
        "usage": {
            "input_tokens": 69,
            "output_tokens": 258,
            "total_tokens": 327,
        },
    }
    second_result = {
        "item_id": "item_2",
        "question": "What is the capital of France?",
        "generated_output": "Paris is the capital",
        "ground_truth": "Paris",
        "response_id": "resp_456",
        "usage": {
            "input_tokens": 50,
            "output_tokens": 100,
            "total_tokens": 150,
        },
    }

    # Exercise the function under test with an explicit model name.
    trace_id_mapping = create_langfuse_dataset_run(
        langfuse=client,
        dataset_name="test_dataset",
        run_name="test_run",
        results=[first_result, second_result],
        model="gpt-4o",
    )

    # Both items must be mapped to the trace ids yielded by observe().
    assert len(trace_id_mapping) == 2
    assert trace_id_mapping["item_1"] == "trace_id_1"
    assert trace_id_mapping["item_2"] == "trace_id_2"

    # One generation is opened per result.
    assert client.generation.call_count == 2

    # Keyword arguments of the first generation() call.
    opened_with = client.generation.call_args_list[0].kwargs
    assert opened_with["name"] == "evaluation-response"
    assert opened_with["trace_id"] == "trace_id_1"
    assert opened_with["input"] == {"question": "What is 2+2?"}
    assert opened_with["metadata"]["ground_truth"] == "4"
    assert opened_with["metadata"]["response_id"] == "resp_123"

    # Each generation is closed, and the first close carries model and usage.
    assert generation.end.call_count == 2

    ended_with = generation.end.call_args_list[0].kwargs
    assert ended_with["output"] == {"answer": "The answer is 4"}
    assert ended_with["model"] == "gpt-4o"
    expected_usage = {
        "input": 69,
        "output": 258,
        "total": 327,
        "unit": "TOKENS",
    }
    assert ended_with["usage"] == expected_usage

    # Remaining interactions with the Langfuse client.
    client.get_dataset.assert_called_once_with("test_dataset")
    client.flush.assert_called_once()
    assert client.trace.call_count == 2
300+
174301
175302class TestUpdateTracesWithCosineScores :
176303 """Test updating Langfuse traces with cosine similarity scores."""
0 commit comments