@@ -9,6 +9,7 @@
 """

 import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any

 import numpy as np
@@ -247,42 +248,55 @@ def upload_dataset_to_langfuse( |
         f"duplication_factor={duplication_factor}"
     )

+    def upload_item(item: dict[str, str], duplicate_num: int) -> bool:
+        try:
+            langfuse.create_dataset_item(
+                dataset_name=dataset_name,
+                input={"question": item["question"]},
+                expected_output={"answer": item["answer"]},
+                metadata={
+                    "original_question": item["question"],
+                    "duplicate_number": duplicate_num + 1,
+                    "duplication_factor": duplication_factor,
+                },
+            )
+            return True
+        except Exception as e:
+            logger.error(
+                f"[upload_dataset_to_langfuse] Failed to upload item | "
+                f"duplicate={duplicate_num + 1} | "
+                f"question={item['question'][:50]}... | {e}"
+            )
+            return False
+
     try:
         # Create or get dataset in Langfuse
         dataset = langfuse.create_dataset(name=dataset_name)

-        # Upload items with duplication
+        upload_tasks = [
+            (item, duplicate_num)
+            for item in items
+            for duplicate_num in range(duplication_factor)
+        ]
+
+        # Upload items concurrently using ThreadPoolExecutor
         total_uploaded = 0
-        for item in items:
-            # Duplicate each item N times
-            for duplicate_num in range(duplication_factor):
-                try:
-                    langfuse.create_dataset_item(
-                        dataset_name=dataset_name,
-                        input={"question": item["question"]},
-                        expected_output={"answer": item["answer"]},
-                        metadata={
-                            "original_question": item["question"],
-                            "duplicate_number": duplicate_num + 1,
-                            "duplication_factor": duplication_factor,
-                        },
-                    )
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            # Submit all upload tasks and collect the futures
+            futures = []
+            for item, dup_num in upload_tasks:
+                future = executor.submit(upload_item, item, dup_num)
+                futures.append(future)
+
+            for future in as_completed(futures):
+                upload_successful = future.result()
+                if upload_successful:
                     total_uploaded += 1
-                except Exception as e:
-                    logger.error(
-                        f"[upload_dataset_to_langfuse] Failed to upload item | "
-                        f"duplicate={duplicate_num + 1} | "
-                        f"question={item['question'][:50]}... | {e}"
-                    )
-
-            # Flush after each original item's duplicates to prevent race conditions
-            # in Langfuse SDK's internal batching that could mix up Q&A pairs
-            langfuse.flush()

         # Final flush to ensure all items are uploaded
         langfuse.flush()

-        langfuse_dataset_id = dataset.id if hasattr(dataset, "id") else None
+        langfuse_dataset_id = dataset.id

         logger.info(
             f"[upload_dataset_to_langfuse] Successfully uploaded to Langfuse | "
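For reference, the change replaces the sequential nested loops with the standard `ThreadPoolExecutor`/`as_completed` fan-out: every (item, duplicate) pair is submitted up front, and results are tallied as workers finish, in completion order rather than submission order. Below is a minimal, self-contained sketch of that pattern; `fake_upload` is a hypothetical stand-in for the real `upload_item`/`langfuse.create_dataset_item` call, and the sample `items` are invented for illustration.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_upload(item: dict[str, str], duplicate_num: int) -> bool:
    # Hypothetical stand-in for upload_item: report success as a bool
    # instead of raising, so one bad item never aborts the batch.
    return bool(item.get("question"))

items = [
    {"question": "Q1", "answer": "A1"},
    {"question": "Q2", "answer": "A2"},
]
duplication_factor = 3
tasks = [(item, n) for item in items for n in range(duplication_factor)]

total_uploaded = 0
with ThreadPoolExecutor(max_workers=4) as executor:
    # Submit everything first, then count results as futures complete.
    futures = [executor.submit(fake_upload, item, n) for item, n in tasks]
    for future in as_completed(futures):
        if future.result():
            total_uploaded += 1

print(f"uploaded {total_uploaded}/{len(tasks)}")  # -> uploaded 6/6
```

Note the trade-off the diff makes explicit: the old per-item `langfuse.flush()` (and its comment about races in the SDK's internal batching) is removed, and the code now relies on a single `langfuse.flush()` after all futures have completed.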