Commit fc46fa3

Evaluation: Uploading dataset concurrently (#461)
* fix: add threadpool based concurrency to speed up langfuse dataset upload
* chore: fix precommit linting issues
* fix: cleanup and deleted CELERY.md
* chore: formatting

Co-authored-by: Akhilesh Negi <akhileshnegi.an3@gmail.com>
1 parent 974884c commit fc46fa3

File tree

2 files changed: +44 −30 lines

backend/app/crud/evaluations/langfuse.py

Lines changed: 40 additions & 26 deletions
@@ -9,6 +9,7 @@
 """
 
 import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any
 
 import numpy as np
@@ -247,42 +248,55 @@ def upload_dataset_to_langfuse(
         f"duplication_factor={duplication_factor}"
     )
 
+    def upload_item(item: dict[str, str], duplicate_num: int) -> bool:
+        try:
+            langfuse.create_dataset_item(
+                dataset_name=dataset_name,
+                input={"question": item["question"]},
+                expected_output={"answer": item["answer"]},
+                metadata={
+                    "original_question": item["question"],
+                    "duplicate_number": duplicate_num + 1,
+                    "duplication_factor": duplication_factor,
+                },
+            )
+            return True
+        except Exception as e:
+            logger.error(
+                f"[upload_dataset_to_langfuse] Failed to upload item | "
+                f"duplicate={duplicate_num + 1} | "
+                f"question={item['question'][:50]}... | {e}"
+            )
+            return False
+
     try:
         # Create or get dataset in Langfuse
         dataset = langfuse.create_dataset(name=dataset_name)
 
-        # Upload items with duplication
+        upload_tasks = [
+            (item, duplicate_num)
+            for item in items
+            for duplicate_num in range(duplication_factor)
+        ]
+
+        # Upload items concurrently using ThreadPoolExecutor
         total_uploaded = 0
-        for item in items:
-            # Duplicate each item N times
-            for duplicate_num in range(duplication_factor):
-                try:
-                    langfuse.create_dataset_item(
-                        dataset_name=dataset_name,
-                        input={"question": item["question"]},
-                        expected_output={"answer": item["answer"]},
-                        metadata={
-                            "original_question": item["question"],
-                            "duplicate_number": duplicate_num + 1,
-                            "duplication_factor": duplication_factor,
-                        },
-                    )
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            # Submit all upload tasks and collect the futures
+            futures = []
+            for item, dup_num in upload_tasks:
+                future = executor.submit(upload_item, item, dup_num)
+                futures.append(future)
+
+            for future in as_completed(futures):
+                upload_successful = future.result()
+                if upload_successful:
                     total_uploaded += 1
-                except Exception as e:
-                    logger.error(
-                        f"[upload_dataset_to_langfuse] Failed to upload item | "
-                        f"duplicate={duplicate_num + 1} | "
-                        f"question={item['question'][:50]}... | {e}"
-                    )
-
-        # Flush after each original item's duplicates to prevent race conditions
-        # in Langfuse SDK's internal batching that could mix up Q&A pairs
-        langfuse.flush()
 
         # Final flush to ensure all items are uploaded
         langfuse.flush()
 
-        langfuse_dataset_id = dataset.id if hasattr(dataset, "id") else None
+        langfuse_dataset_id = dataset.id
 
         logger.info(
             f"[upload_dataset_to_langfuse] Successfully uploaded to Langfuse | "

backend/app/tests/crud/evaluations/test_langfuse.py

Lines changed: 4 additions & 4 deletions
@@ -415,8 +415,8 @@ def test_upload_dataset_to_langfuse_success(self, valid_items):
         # Verify dataset items were created (3 original * 5 duplicates = 15)
         assert mock_langfuse.create_dataset_item.call_count == 15
 
-        # Verify flush was called (once per original item + final flush = 4 times for 3 items)
-        assert mock_langfuse.flush.call_count == 4  # 3 items + 1 final
+        # Verify flush was called once (final flush)
+        assert mock_langfuse.flush.call_count == 1
 
     def test_upload_dataset_to_langfuse_duplication_metadata(self, valid_items):
         """Test that duplication metadata is included."""
@@ -483,8 +483,8 @@ def test_upload_dataset_to_langfuse_single_duplication(self, valid_items):
 
         assert total_items == 3  # 3 items * 1 duplication
         assert mock_langfuse.create_dataset_item.call_count == 3
-        # 3 items + 1 final flush
-        assert mock_langfuse.flush.call_count == 4
+        # final flush once
+        assert mock_langfuse.flush.call_count == 1
 
     def test_upload_dataset_to_langfuse_item_creation_error(self, valid_items):
         """Test that item creation errors are logged but don't stop processing."""
