@@ -139,13 +139,13 @@ def stats(self):
 
 def build_messages(
     input_data: LLMInput,
-    message_modifier: Optional[Callable[[List[Dict[str, Any]]], List[Dict[str, Any]]]] = None
+    transform_user_message: Optional[Callable[[str], str]] = None
 ) -> List[Dict[str, Any]]:
     """Build messages for LLaMA.cpp chat completion.
 
     Args:
         input_data: The input data
-        message_modifier: Optional function to modify messages before returning (e.g. add /think)
+        transform_user_message: Optional function to transform user message text before building messages
     """
     messages = [
         {
@@ -157,8 +157,11 @@ def build_messages(
     # Add context messages
     for msg in input_data.context:
         message_content = []
-        if msg.text:
-            message_content.append({"type": "text", "text": msg.text})
+        text = msg.text
+        if transform_user_message and msg.role == ContextMessageRole.USER:
+            text = transform_user_message(text)
+        if text:
+            message_content.append({"type": "text", "text": text})
         if hasattr(msg, 'image') and msg.image:
             if msg.image.path:
                 message_content.append({"type": "image_url", "image_url": {"url": msg.image.path}})
@@ -171,18 +174,18 @@ def build_messages(
 
     # Add user message
     user_content = []
-    if input_data.text:
-        user_content.append({"type": "text", "text": input_data.text})
+    text = input_data.text
+    if transform_user_message:
+        text = transform_user_message(text)
+    if text:
+        user_content.append({"type": "text", "text": text})
     if hasattr(input_data, 'image') and input_data.image:
         if input_data.image.path:
             user_content.append({"type": "image_url", "image_url": {"url": input_data.image.path}})
         elif input_data.image.uri:
             user_content.append({"type": "image_url", "image_url": {"url": input_data.image.uri}})
     messages.append({"role": "user", "content": user_content})
 
-    if message_modifier:
-        messages = message_modifier(messages)
-
     return messages
 
 
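
This replaces the `message_modifier` hook (which rewrote the full message list after the fact) with `transform_user_message`, a `str -> str` callback applied to the user's text, and to USER-role context messages, before the content blocks are assembled. A minimal sketch of the new call site, assuming an `input_data` built elsewhere; `add_think_directive` is a hypothetical helper echoing the `/think` example from the old docstring:

```python
def add_think_directive(text: str) -> str:
    # Hypothetical helper: append the /think directive mentioned in the
    # old message_modifier docstring to the raw user text.
    return f"{text} /think"

# The callback runs on the user message and on USER-role context
# messages before the content blocks are built.
messages = build_messages(input_data, transform_user_message=add_think_directive)
```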
@@ -195,8 +198,21 @@ def stream_generate(
     max_tokens: int = 4096,
     stop: Optional[List[str]] = None,
     handle_thinking: bool = False,
+    transform_response: Optional[Callable[[str, str], tuple[str, LLMOutput]]] = None,
 ) -> Generator[LLMOutput, None, None]:
-    """Stream generate from LLaMA.cpp model with timing and usage tracking."""
+    """Stream generate from LLaMA.cpp model with timing and usage tracking.
+
+    Args:
+        model: The LLaMA.cpp model instance
+        messages: List of messages to send to the model
+        output_cls: Output class type to use for responses
+        temperature: Sampling temperature
+        top_p: Top-p sampling threshold
+        max_tokens: Maximum tokens to generate
+        stop: Optional list of stop sequences
+        handle_thinking: Whether to handle thinking tags
+        transform_response: Optional function to transform responses; takes (piece, buffer) and returns (new_buffer, output)
+    """
     response_queue: Queue[Optional[tuple[str, dict]]] = Queue()
     thread_exception = None
     usage_stats = {
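
Per the new docstring, `transform_response` receives the newly decoded `piece` and the accumulated `buffer`, and returns the updated buffer plus the `LLMOutput` to yield for that chunk; on the final chunk, `stream_generate` attaches usage to the returned output itself (see the hunk below). A sketch of a conforming callback, assuming `LLMOutput` can be constructed from `response` alone:

```python
def passthrough_transform(piece: str, buffer: str) -> tuple[str, LLMOutput]:
    # Accumulate the raw piece and emit the running response unchanged.
    # Usage stats are not set here: stream_generate assigns output.usage
    # on the final chunk after calling this callback.
    buffer += piece
    return buffer, LLMOutput(response=buffer.strip())
```

Note that when `transform_response` is supplied, the built-in `handle_thinking` branch is bypassed entirely, so the callback owns all piece handling, including any `<think>` tag parsing.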
@@ -279,32 +295,57 @@ def generation_thread():
                     completion_tokens=usage_stats["completion_tokens"],
                     total_tokens=usage_stats["total_tokens"]
                 )
-                yield output_cls(
-                    response=buffer.strip(),
-                    thinking_content=thinking_content.strip() if thinking_content else None,
-                    usage=usage
-                )
+
+                if transform_response:
+                    buffer, output = transform_response(piece or "", buffer)
+                    output.usage = usage
+                    yield output
+                else:
+                    # Handle thinking vs response content if enabled
+                    if handle_thinking and "</think>" in piece:
+                        parts = piece.split("</think>")
+                        if in_thinking:
+                            thinking_content += parts[0].replace("<think>", "")
+                            buffer = parts[1] if len(parts) > 1 else ""
+                            in_thinking = False
+                        else:
+                            buffer += piece
+                    else:
+                        if in_thinking:
+                            thinking_content += piece.replace("<think>", "")
+                        else:
+                            buffer += piece
+
+                    yield output_cls(
+                        response=buffer.strip(),
+                        thinking_content=thinking_content.strip() if thinking_content else None,
+                        usage=usage
+                    )
                 break
 
-            # Handle thinking vs response content if enabled
-            if handle_thinking and "</think>" in piece:
-                parts = piece.split("</think>")
-                if in_thinking:
-                    thinking_content += parts[0].replace("<think>", "")
-                    buffer = parts[1] if len(parts) > 1 else ""
-                    in_thinking = False
-                else:
-                    buffer += piece
+            if transform_response:
+                buffer, output = transform_response(piece, buffer)
+                yield output
             else:
-                if in_thinking:
-                    thinking_content += piece.replace("<think>", "")
+                # Handle thinking vs response content if enabled
+                if handle_thinking and "</think>" in piece:
+                    parts = piece.split("</think>")
+                    if in_thinking:
+                        thinking_content += parts[0].replace("<think>", "")
+                        buffer = parts[1] if len(parts) > 1 else ""
+                        in_thinking = False
+                    else:
+                        buffer += piece
                 else:
-                    buffer += piece
+                    if in_thinking:
+                        thinking_content += piece.replace("<think>", "")
+                    else:
+                        buffer += piece
 
-            yield output_cls(
-                response=buffer.strip(),
-                thinking_content=thinking_content.strip() if thinking_content else None
-            )
+                yield output_cls(
+                    response=buffer.strip(),
+                    thinking_content=thinking_content.strip() if thinking_content else None
+                )
 
     except Exception as e:
         if thread_exception and isinstance(e, thread_exception.__class__):
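
Putting the two hooks together, a caller might stream with both callbacks. A sketch assuming `model` and `input_data` exist elsewhere, reusing the hypothetical helpers from above, and that `LLMOutput` is the output class in use:

```python
messages = build_messages(input_data, transform_user_message=add_think_directive)
for output in stream_generate(
    model,
    messages,
    output_cls=LLMOutput,
    transform_response=passthrough_transform,
):
    # Each chunk carries the running response; the final chunk also
    # carries usage, attached by stream_generate after the callback.
    print(output.response, end="", flush=True)
```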