Commit 4f0e3e8

improve llm sdk
1 parent 0a9e5df commit 4f0e3e8

File tree

1 file changed: +13 -2 lines changed


src/inferencesh/models/llm.py

Lines changed: 13 additions & 2 deletions
@@ -228,6 +228,7 @@ class ResponseTransformer:
     def __init__(self, output_cls: type[LLMOutput] = LLMOutput):
         self.state = ResponseState()
         self.output_cls = output_cls
+        self.timing = None  # Will be set by stream_generate
 
     def clean_text(self, text: str) -> str:
         """Clean common tokens from the text and apply model-specific cleaning.
@@ -264,10 +265,17 @@ def handle_reasoning(self, text: str) -> None:
             text: Cleaned text to process for reasoning
         """
         # Default implementation for <think> style reasoning
-        if "<think>" in text:
+        if "<think>" in text and not self.state.state_changes["reasoning_started"]:
             self.state.state_changes["reasoning_started"] = True
-        if "</think>" in text:
+            if self.timing:
+                self.timing.start_reasoning()
+
+        if "</think>" in text and not self.state.state_changes["reasoning_ended"]:
             self.state.state_changes["reasoning_ended"] = True
+            if self.timing:
+                # Estimate token count from character count (rough approximation)
+                token_count = len(self.state.buffer.split("<think>")[1].split("</think>")[0]) // 4
+                self.timing.end_reasoning(token_count)
 
         if "<think>" in self.state.buffer:
             parts = self.state.buffer.split("</think>", 1)
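
A note on the token estimate in this hunk: the "// 4" divisor reflects the common rule of thumb that English text averages roughly four characters per token, so this is a cheap approximation rather than a real tokenizer count. A minimal illustration of the heuristic, using a hypothetical buffer value not taken from this commit:

# Hypothetical illustration of the chars-per-token heuristic used in end_reasoning.
buffer = "<think>Weigh both options before answering.</think>The answer is B."
reasoning = buffer.split("<think>")[1].split("</think>")[0]
approx_tokens = len(reasoning) // 4  # 36 chars // 4 -> 9, a rough stand-in for a tokenizer count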
@@ -381,6 +389,9 @@ def stream_generate(
         }
 
         with timing_context() as timing:
+            # Set timing context in transformer
+            transformer.timing = timing
+
             def generation_thread():
                 nonlocal thread_exception, usage_stats
                 try:
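
This hunk relies on a timing object that exposes start_reasoning() and end_reasoning(token_count) and is produced by the timing_context() context manager; neither is shown in this diff. A minimal sketch of an interface compatible with those calls might look like the following, where every name other than timing_context, start_reasoning, and end_reasoning is hypothetical:

import time
from contextlib import contextmanager

class GenerationTiming:
    # Hypothetical stand-in for the SDK's real timing object.
    def __init__(self):
        self.reasoning_start = None
        self.reasoning_duration = 0.0
        self.reasoning_tokens = 0

    def start_reasoning(self):
        # Mark the moment the <think> block is first seen.
        self.reasoning_start = time.monotonic()

    def end_reasoning(self, token_count):
        # Record elapsed reasoning time and the approximate token count.
        if self.reasoning_start is not None:
            self.reasoning_duration = time.monotonic() - self.reasoning_start
        self.reasoning_tokens = token_count

@contextmanager
def timing_context():
    # Yield one timing object per generation.
    yield GenerationTiming()

Setting transformer.timing inside stream_generate, rather than in the transformer's constructor, keeps the transformer usable without timing and lets stream_generate own the timing lifecycle.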
