@@ -228,6 +228,7 @@ class ResponseTransformer:
def __init__(self, output_cls: type[LLMOutput] = LLMOutput) -> None:
    """Set up a transformer with fresh response state and no timing context.

    Args:
        output_cls: Class stored on the instance as ``output_cls``.
            Defaults to ``LLMOutput``.
    """
    self.state = ResponseState()
    self.output_cls = output_cls
    # Starts unset; stream_generate assigns the active timing context, and
    # the reasoning hooks check this attribute before calling into it.
    self.timing = None
231232
232233 def clean_text (self , text : str ) -> str :
233234 """Clean common tokens from the text and apply model-specific cleaning.
@@ -264,10 +265,17 @@ def handle_reasoning(self, text: str) -> None:
264265 text: Cleaned text to process for reasoning
265266 """
266267 # Default implementation for <think> style reasoning
267- if "<think>" in text :
268+ if "<think>" in text and not self . state . state_changes [ "reasoning_started" ] :
268269 self .state .state_changes ["reasoning_started" ] = True
269- if "</think>" in text :
270+ if self .timing :
271+ self .timing .start_reasoning ()
272+
273+ if "</think>" in text and not self .state .state_changes ["reasoning_ended" ]:
270274 self .state .state_changes ["reasoning_ended" ] = True
275+ if self .timing :
276+ # Estimate token count from character count (rough approximation)
277+ token_count = len (self .state .buffer .split ("<think>" )[1 ].split ("</think>" )[0 ]) // 4
278+ self .timing .end_reasoning (token_count )
271279
272280 if "<think>" in self .state .buffer :
273281 parts = self .state .buffer .split ("</think>" , 1 )
@@ -381,6 +389,9 @@ def stream_generate(
381389 }
382390
383391 with timing_context () as timing :
392+ # Set timing context in transformer
393+ transformer .timing = timing
394+
384395 def generation_thread ():
385396 nonlocal thread_exception , usage_stats
386397 try :
0 commit comments