@@ -3801,6 +3801,106 @@ def __call__(self, **kwargs):
38013801 return super ().__call__ (** kwargs )
38023802
38033803
class GLM46VChatHandler(Llava15ChatHandler):
    """Chat handler for GLM-4.6V vision models.

    Renders messages with the GLM-4 `[gMASK]<sop>` template, wraps image
    content in `<|begin_of_image|>` / `<|end_of_image|>` markers, and
    supports toggling the model's chain-of-thought via ``enable_thinking``.
    State (KV cache, token buffer, cached image embeddings) is fully reset
    on every call so requests cannot leak into each other.
    """

    # Special tokens for the GLM-4.6V tokenizer.
    GLM46V_EOS_TOKEN = "<|endoftext|>"
    GLM46V_PAD_TOKEN = "<|endoftext|>"
    GLM46V_IMAGE_START_TOKEN = "<|begin_of_image|>"
    GLM46V_IMAGE_END_TOKEN = "<|end_of_image|>"

    # Jinja2 chat template. `enable_thinking` and `GLM46V_EOS_TOKEN` are
    # injected via extra_template_arguments in __call__.
    CHAT_FORMAT = (
        "[gMASK]<sop>"
        "{%- if tools -%}"
        "<|system|>\n# Tools\n\nYou may call one or more functions to assist with the user query.\n"
        "You are provided with function signatures within <tools></tools> XML tags:\n<tools>\n"
        "{%- for tool in tools -%}"
        "{{ tool | tojson(ensure_ascii=False) }}\n"
        "{%- endfor -%}"
        "</tools>\n\nFor each function call, output the function name and arguments within the following XML format:\n"
        "<tool_call>{function-name}\n<arg_key>{arg-key-1}</arg_key>\n<arg_value>{arg-value-1}</arg_value>\n...\n</tool_call>"
        "{%- endif -%}"

        "{%- for m in messages -%}"
        "{%- if m.role == 'system' -%}"
        "<|system|>\n{{ m.content }}"
        "{%- elif m.role == 'user' -%}"
        "<|user|>\n"
        "{%- if m.content is string -%}"
        "{{ m.content }}"
        "{%- else -%}"
        "{%- for item in m.content -%}"
        "{%- if item.type == 'image_url' or 'image_url' in item -%}"
        "<|begin_of_image|>"
        "{%- if item.image_url is string -%}"
        "{{- item.image_url -}}"
        "{%- else -%}"
        "{{- item.image_url.url -}}"
        "{%- endif -%}"
        "<|end_of_image|>"
        "{%- elif item.type == 'text' -%}"
        "{{ item.text }}"
        "{%- endif -%}"
        "{%- endfor -%}"
        "{%- endif -%}"
        # If enable_thinking is disabled, insert `/nothink` according to the source code logic.
        "{{ '/nothink' if not enable_thinking else '' }}"
        "{%- elif m.role == 'assistant' -%}"
        "<|assistant|>"
        "{%- if enable_thinking -%}"
        "{%- set reasoning = m.reasoning_content if m.reasoning_content is string else '' -%}"
        "\n<think>{{ reasoning.strip() }}</think>"
        "{%- else -%}"
        "\n<think></think>"
        "{%- endif -%}"
        "{{ '\n' + m.content.strip() if m.content.strip() else '' }}"
        "{%- endif -%}"
        "{{ GLM46V_EOS_TOKEN }}"
        "{%- endfor -%}"

        "{%- if add_generation_prompt -%}"
        "<|assistant|>\n"
        "{{ '<think>' if enable_thinking else '<think></think>\n' }}"
        "{%- endif -%}"
    )

    def __init__(self, enable_thinking: bool = True, **kwargs):
        """
        GLM-4.6V Handler

        Parameters:
        - enable_thinking (bool): Whether to enable the model's think process. The default is True.
        """
        self.enable_thinking = enable_thinking
        super().__init__(**kwargs)

    def __call__(self, **kwargs):
        # Expose handler settings to the Jinja chat template.
        self.extra_template_arguments["enable_thinking"] = self.enable_thinking
        self.extra_template_arguments["GLM46V_EOS_TOKEN"] = self.GLM46V_EOS_TOKEN

        # Model stop tokens, per:
        # https://huggingface.co/zai-org/GLM-4.6V-Flash/blob/main/generation_config.json
        # Merge with any caller-supplied stop sequences instead of silently
        # discarding them (previously `kwargs['stop']` was overwritten).
        model_stops = [self.GLM46V_EOS_TOKEN, "<|user|>", "<|observation|>", "<|code_middle|>"]
        user_stops = kwargs.get('stop') or []
        if isinstance(user_stops, str):
            user_stops = [user_stops]
        kwargs['stop'] = model_stops + [s for s in user_stops if s not in model_stops]

        # Hard-reset the llama context so no KV-cache/token state from a
        # previous request survives into this one.
        llama = kwargs['llama']
        llama.reset()
        llama._ctx.memory_clear(True)
        llama.n_tokens = 0

        if hasattr(llama, 'input_ids'):
            llama.input_ids.fill(0)

        # Drop cached image embeddings so images are re-encoded per request.
        if hasattr(self, '_last_image_embed'):
            self._last_image_embed = None
            self._last_image_hash = None

        if self.verbose:
            messages = kwargs.get('messages', [])
            try:
                image_count = len(self.get_image_urls(messages))
                print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Processing {image_count} images", file=sys.stderr)
            except Exception:
                print(f"GLM46VChatHandler(enable_thinking={self.enable_thinking}) - Cleared state", file=sys.stderr)

        return super().__call__(**kwargs)
3903+
38043904class LFM2VLChatHandler (Llava15ChatHandler ):
38053905 LFM2VL_BOS_TOKEN = "<|startoftext|>"
38063906 LFM2VL_EOS_TOKEN = "<|im_end|>"
0 commit comments