Commit 7e4f7d3

Evaluation: Show Total Cost (#445)
* added cost
* added test cases
1 parent f6dc638 · commit 7e4f7d3

File tree

3 files changed: +184 −5 lines changed


backend/app/crud/evaluations/langfuse.py

Lines changed: 42 additions & 3 deletions
@@ -20,15 +20,20 @@ def create_langfuse_dataset_run(
     dataset_name: str,
     run_name: str,
     results: list[dict[str, Any]],
+    model: str | None = None,
 ) -> dict[str, str]:
     """
     Create a dataset run in Langfuse with traces for each evaluation item.
 
     This function:
     1. Gets the dataset from Langfuse (which already exists)
     2. For each result, creates a trace linked to the dataset item
-    3. Logs input (question), output (generated_output), and expected (ground_truth)
-    4. Returns a mapping of item_id -> trace_id for later score updates
+    3. Creates a generation within the trace with usage/model for cost tracking
+    4. Logs input (question), output (generated_output), and expected (ground_truth)
+    5. Returns a mapping of item_id -> trace_id for later score updates
+
+    Note: Cost tracking in Langfuse happens at the generation level, not the trace level.
+    We create a generation within each trace to enable automatic cost calculation.
 
     Args:
         langfuse: Configured Langfuse client
@@ -41,10 +46,16 @@ def create_langfuse_dataset_run(
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
-                "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8"
+                "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8",
+                "usage": {
+                    "input_tokens": 69,
+                    "output_tokens": 258,
+                    "total_tokens": 327
+                }
             },
             ...
         ]
+        model: Model name used for evaluation (for cost calculation by Langfuse)
 
     Returns:
         dict[str, str]: Mapping of item_id to Langfuse trace_id
@@ -71,6 +82,7 @@ def create_langfuse_dataset_run(
             generated_output = result["generated_output"]
             ground_truth = result["ground_truth"]
             response_id = result.get("response_id")
+            usage_raw = result.get("usage")
 
             dataset_item = dataset_items_map.get(item_id)
             if not dataset_item:
@@ -89,12 +101,39 @@ def create_langfuse_dataset_run(
             if response_id:
                 metadata["response_id"] = response_id
 
+            # Create trace with basic info
             langfuse.trace(
                 id=trace_id,
                 input={"question": question},
                 output={"answer": generated_output},
                 metadata=metadata,
             )
+
+            # Convert usage to Langfuse format
+            usage = None
+            if usage_raw:
+                usage = {
+                    "input": usage_raw.get("input_tokens", 0),
+                    "output": usage_raw.get("output_tokens", 0),
+                    "total": usage_raw.get("total_tokens", 0),
+                    "unit": "TOKENS",
+                }
+
+            # Create a generation within the trace for cost tracking
+            # (cost tracking happens at the generation level, not the trace level)
+            if usage and model:
+                generation = langfuse.generation(
+                    name="evaluation-response",
+                    trace_id=trace_id,
+                    input={"question": question},
+                    metadata=metadata,
+                )
+                generation.end(
+                    output={"answer": generated_output},
+                    model=model,
+                    usage=usage,
+                )
+
             trace_id_mapping[item_id] = trace_id
 
         except Exception as e:
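The cost-tracking pattern this diff introduces can be exercised in isolation. Below is a minimal sketch, assuming a Langfuse v2-style Python client configured via LANGFUSE_* environment variables; the trace id and token counts are illustrative, and only calls already visible in the diff and tests (trace, generation, generation.end, flush) are used:

from langfuse import Langfuse

langfuse = Langfuse()  # assumes LANGFUSE_PUBLIC_KEY / SECRET_KEY / HOST in the environment

trace_id = "trace-123"  # hypothetical id; the production code derives one per dataset item
langfuse.trace(
    id=trace_id,
    input={"question": "What is 2+2?"},
    output={"answer": "4"},
)

# Cost is derived from (model, usage) on the generation, not on the trace,
# so a generation is attached to the trace and closed with both fields.
generation = langfuse.generation(
    name="evaluation-response",
    trace_id=trace_id,
    input={"question": "What is 2+2?"},
)
generation.end(
    output={"answer": "4"},
    model="gpt-4o",
    usage={"input": 69, "output": 258, "total": 327, "unit": "TOKENS"},
)
langfuse.flush()  # push buffered events before the process exits

With this shape, Langfuse can price the generation from its model price table whenever the supplied model name is one it knows.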

backend/app/crud/evaluations/processing.py

Lines changed: 14 additions & 1 deletion
@@ -63,7 +63,12 @@ def parse_evaluation_output(
             "question": "What is 2+2?",
             "generated_output": "4",
             "ground_truth": "4",
-            "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8"
+            "response_id": "resp_0b99aadfead1fb62006908e7f540c48197bd110183a347c1d8",
+            "usage": {
+                "input_tokens": 69,
+                "output_tokens": 258,
+                "total_tokens": 327
+            }
         },
         ...
     ]
@@ -97,6 +102,9 @@ def parse_evaluation_output(
         # Extract response ID from response.body.id
         response_id = response_body.get("id")
 
+        # Extract usage information for cost tracking
+        usage = response_body.get("usage")
+
         # Handle errors in batch processing
         if response.get("error"):
             error_msg = response["error"].get("message", "Unknown error")
@@ -152,6 +160,7 @@ def parse_evaluation_output(
                 "generated_output": generated_output,
                 "ground_truth": ground_truth,
                 "response_id": response_id,
+                "usage": usage,
             }
         )
 
@@ -244,12 +253,16 @@ async def process_completed_evaluation(
     if not results:
         raise ValueError("No valid results found in batch output")
 
+    # Extract model from config for cost tracking
+    model = eval_run.config.get("model") if eval_run.config else None
+
     # Step 5: Create Langfuse dataset run with traces
     trace_id_mapping = create_langfuse_dataset_run(
         langfuse=langfuse,
        dataset_name=eval_run.dataset_name,
         run_name=eval_run.run_name,
         results=results,
+        model=model,
     )
 
     # Store object store URL in database
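For context on the data this parser now forwards, here is a hedged sketch of pulling usage out of one line of batch output. The JSON layout (a custom_id plus a response.body envelope) is an assumption about the batch results format; only the .get() accesses shown in the diff are relied on:

import json

# One output line from the batch results file (illustrative payload).
line = (
    '{"custom_id": "item_1", "response": {"body": {'
    '"id": "resp_abc", '
    '"usage": {"input_tokens": 69, "output_tokens": 258, "total_tokens": 327}}}}'
)

record = json.loads(line)
response = record.get("response", {})
response_body = response.get("body", {})

response_id = response_body.get("id")  # -> "resp_abc"
usage = response_body.get("usage")     # forwarded unchanged into each result dict
print(response_id, usage)

The usage dict is deliberately passed through untouched here; the conversion to Langfuse's {"input", "output", "total", "unit"} shape happens later, in create_langfuse_dataset_run.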

backend/app/tests/crud/evaluations/test_langfuse.py

Lines changed: 128 additions & 1 deletion
@@ -34,19 +34,31 @@ def test_create_langfuse_dataset_run_success(self):
         mock_dataset.items = [mock_item1, mock_item2]
         mock_langfuse.get_dataset.return_value = mock_dataset
 
-        # Test data
+        # Test data with usage and response_id
         results = [
             {
                 "item_id": "item_1",
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 10,
+                    "output_tokens": 5,
+                    "total_tokens": 15,
+                },
             },
             {
                 "item_id": "item_2",
                 "question": "What is the capital of France?",
                 "generated_output": "Paris",
                 "ground_truth": "Paris",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 12,
+                    "output_tokens": 3,
+                    "total_tokens": 15,
+                },
             },
         ]
 
@@ -88,12 +100,24 @@ def test_create_langfuse_dataset_run_skips_missing_items(self):
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 10,
+                    "output_tokens": 5,
+                    "total_tokens": 15,
+                },
             },
             {
                 "item_id": "item_nonexistent",
                 "question": "Invalid question",
                 "generated_output": "Invalid",
                 "ground_truth": "Invalid",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 8,
+                    "output_tokens": 2,
+                    "total_tokens": 10,
+                },
             },
         ]
 
@@ -133,12 +157,24 @@ def test_create_langfuse_dataset_run_handles_trace_error(self):
                 "question": "What is 2+2?",
                 "generated_output": "4",
                 "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 10,
+                    "output_tokens": 5,
+                    "total_tokens": 15,
+                },
             },
             {
                 "item_id": "item_2",
                 "question": "What is the capital?",
                 "generated_output": "Paris",
                 "ground_truth": "Paris",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 8,
+                    "output_tokens": 2,
+                    "total_tokens": 10,
+                },
             },
         ]
 
@@ -171,6 +207,97 @@ def test_create_langfuse_dataset_run_empty_results(self):
         assert len(trace_id_mapping) == 0
         mock_langfuse.flush.assert_called_once()
 
+    def test_create_langfuse_dataset_run_with_cost_tracking(self):
+        """Test that generation() is called with usage when model and usage are provided."""
+        # Mock Langfuse client
+        mock_langfuse = MagicMock()
+        mock_dataset = MagicMock()
+        mock_generation = MagicMock()
+
+        # Mock dataset items
+        mock_item1 = MagicMock()
+        mock_item1.id = "item_1"
+        mock_item1.observe.return_value.__enter__.return_value = "trace_id_1"
+
+        mock_item2 = MagicMock()
+        mock_item2.id = "item_2"
+        mock_item2.observe.return_value.__enter__.return_value = "trace_id_2"
+
+        mock_dataset.items = [mock_item1, mock_item2]
+        mock_langfuse.get_dataset.return_value = mock_dataset
+        mock_langfuse.generation.return_value = mock_generation
+
+        # Test data with usage and model
+        results = [
+            {
+                "item_id": "item_1",
+                "question": "What is 2+2?",
+                "generated_output": "The answer is 4",
+                "ground_truth": "4",
+                "response_id": "resp_123",
+                "usage": {
+                    "input_tokens": 69,
+                    "output_tokens": 258,
+                    "total_tokens": 327,
+                },
+            },
+            {
+                "item_id": "item_2",
+                "question": "What is the capital of France?",
+                "generated_output": "Paris is the capital",
+                "ground_truth": "Paris",
+                "response_id": "resp_456",
+                "usage": {
+                    "input_tokens": 50,
+                    "output_tokens": 100,
+                    "total_tokens": 150,
+                },
+            },
+        ]
+
+        # Call function with model parameter
+        trace_id_mapping = create_langfuse_dataset_run(
+            langfuse=mock_langfuse,
+            dataset_name="test_dataset",
+            run_name="test_run",
+            results=results,
+            model="gpt-4o",
+        )
+
+        # Verify results
+        assert len(trace_id_mapping) == 2
+        assert trace_id_mapping["item_1"] == "trace_id_1"
+        assert trace_id_mapping["item_2"] == "trace_id_2"
+
+        # Verify generation() was called for cost tracking
+        assert mock_langfuse.generation.call_count == 2
+
+        # Verify the first generation call
+        first_call = mock_langfuse.generation.call_args_list[0]
+        assert first_call.kwargs["name"] == "evaluation-response"
+        assert first_call.kwargs["trace_id"] == "trace_id_1"
+        assert first_call.kwargs["input"] == {"question": "What is 2+2?"}
+        assert first_call.kwargs["metadata"]["ground_truth"] == "4"
+        assert first_call.kwargs["metadata"]["response_id"] == "resp_123"
+
+        # Verify generation.end() was called with usage
+        assert mock_generation.end.call_count == 2
+
+        first_end_call = mock_generation.end.call_args_list[0]
+        assert first_end_call.kwargs["output"] == {"answer": "The answer is 4"}
+        assert first_end_call.kwargs["model"] == "gpt-4o"
+        assert first_end_call.kwargs["usage"] == {
+            "input": 69,
+            "output": 258,
+            "total": 327,
+            "unit": "TOKENS",
+        }
+
+        # Verify Langfuse calls
+        mock_langfuse.get_dataset.assert_called_once_with("test_dataset")
+        mock_langfuse.flush.assert_called_once()
+        assert mock_langfuse.trace.call_count == 2
+
 
 class TestUpdateTracesWithCosineScores:
     """Test updating Langfuse traces with cosine similarity scores."""
