@@ -586,6 +586,8 @@ def stream_generate(
 ) -> Generator[BaseLLMOutput, None, None]:
     """Stream generate from LLaMA.cpp model with timing and usage tracking."""

+    print("[DEBUG] Starting stream_generate")
+
     # Create queues for communication between threads
     response_queue = Queue()
     error_queue = Queue()
@@ -597,6 +599,7 @@ def stream_generate(
     def _generate_worker():
         """Worker thread to run the model generation."""
         try:
+            print("[DEBUG] Worker thread started")
             # Build completion kwargs
             completion_kwargs = {
                 "messages": messages,
@@ -613,17 +616,24 @@ def _generate_worker():

             # Signal that we're starting
             keep_alive_queue.put(("init", time.time()))
+            print("[DEBUG] Worker sent init signal")

             completion = model.create_chat_completion(**completion_kwargs)
+            print("[DEBUG] Got completion iterator from model")

+            chunk_count = 0
             for chunk in completion:
+                chunk_count += 1
                 if verbose:
                     print(chunk)
+                if chunk_count % 10 == 0:  # Log every 10th chunk to avoid spam
+                    print(f"[DEBUG] Processing chunk {chunk_count}")
                 response_queue.put(("chunk", chunk))
                 # Update keep-alive timestamp
                 keep_alive_queue.put(("alive", time.time()))

             # Signal completion
+            print(f"[DEBUG] Worker finished processing {chunk_count} chunks")
             response_queue.put(("done", None))

         except Exception as e:
@@ -639,6 +649,7 @@ def _generate_worker():
     # Start generation thread
     generation_thread = Thread(target=_generate_worker, daemon=True)
     generation_thread.start()
+    print("[DEBUG] Started worker thread")

     # Initialize response state
     response = StreamResponse()
@@ -657,10 +668,13 @@ def _generate_worker():
                 print(f"[DEBUG] Raising due to unexpected init message: {msg_type}")
                 raise RuntimeError("Unexpected initialization message")
             last_activity = timestamp
+            print("[DEBUG] Received init signal from worker")
         except Empty:
             print(f"[DEBUG] Raising due to init timeout after {init_timeout}s")
             raise RuntimeError(f"Model failed to initialize within {init_timeout} seconds")

+        chunk_count = 0
+        output_count = 0
         while True:
             # Check for errors - now with proper exception chaining
             if not error_queue.empty():
@@ -689,6 +703,9 @@ def _generate_worker():
             # Get next chunk
             try:
                 msg_type, data = response_queue.get(timeout=0.1)
+                chunk_count += 1
+                if chunk_count % 10 == 0:  # Log every 10th chunk to avoid spam
+                    print(f"[DEBUG] Main loop received chunk {chunk_count}")
             except Empty:
                 continue

@@ -698,33 +715,48 @@ def _generate_worker():
                 print(f"[DEBUG] Raising due to error message: {data}")
                 raise RuntimeError(f"Generation error: {data}")
             elif msg_type == "done":
+                print("[DEBUG] Received done signal from worker")
                 break

             chunk = data

             # Mark first token time
             if not timing.first_token_time:
                 timing.mark_first_token()
+                print("[DEBUG] Marked first token time")

             # Update response state from chunk
             response.update_from_chunk(chunk, timing)

             # Yield output if we have updates
             if response.has_updates():
+                output_count += 1
+                if output_count % 10 == 0:  # Log every 10th output to avoid spam
+                    print(f"[DEBUG] Yielding output {output_count}")
+                    if hasattr(response, 'usage_stats'):
+                        print(f"[DEBUG] Current usage stats: {response.usage_stats}")
                 output, buffer = response.to_output(buffer, transformer)
                 yield output

             # Break if we're done
             if response.finish_reason:
+                print(f"[DEBUG] Breaking loop due to finish_reason: {response.finish_reason}")
                 break

+        print(f"[DEBUG] Main loop finished. Processed {chunk_count} chunks, yielded {output_count} outputs")
+        if hasattr(response, 'usage_stats'):
+            print(f"[DEBUG] Final usage stats: {response.usage_stats}")
+
         # Wait for generation thread to finish
         if generation_thread.is_alive():
+            print("[DEBUG] Waiting for worker thread to finish")
             generation_thread.join(timeout=5.0)  # Increased timeout to 5 seconds
             if generation_thread.is_alive():
                 # Thread didn't finish - this shouldn't happen normally
                 print("[DEBUG] Raising due to thread not finishing after 5s timeout")
                 raise RuntimeError("Generation thread failed to finish")
+        else:
+            print("[DEBUG] Worker thread finished successfully")

     except Exception as e:
         # Check if there's a thread error we should chain with
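For context, the pattern being instrumented by this diff is a daemon worker thread that pushes ("chunk", ...) and ("done", None) messages onto a Queue while the main generator loop drains it with a short timeout so it can also watch for errors and keep-alive signals. A minimal standalone sketch of that producer/consumer shape, with hypothetical names (produce_chunks, consume) that are not part of the patched module:

```python
from queue import Queue, Empty
from threading import Thread
import time

def produce_chunks(response_queue):
    # Hypothetical stand-in for iterating the model's streaming completion
    for i in range(5):
        time.sleep(0.05)
        response_queue.put(("chunk", f"token-{i}"))
    # Signal completion, mirroring the ("done", None) message in the diff
    response_queue.put(("done", None))

def consume():
    response_queue = Queue()
    worker = Thread(target=produce_chunks, args=(response_queue,), daemon=True)
    worker.start()
    while True:
        try:
            msg_type, data = response_queue.get(timeout=0.1)
        except Empty:
            continue  # nothing yet; the real loop also checks an error queue here
        if msg_type == "done":
            break
        print("got", data)
    # Give the worker a bounded time to exit, as the real code does
    worker.join(timeout=5.0)

if __name__ == "__main__":
    consume()
```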