@@ -2868,65 +2868,63 @@ def _init_mtmd_context(self, llama_model: llama.Llama):
28682868 if self .mtmd_ctx is not None :
28692869 return # Already initialized
28702870
2871- with suppress_stdout_stderr (disable = self .verbose ):
2872- self ._mtmd_cpp .mtmd_helper_log_set (llama_log_callback , ctypes .c_void_p (0 ))
2873-
2874- # Get default parameters
2875- self .mctx_params = self ._mtmd_cpp .mtmd_context_params_default ()
2876- self .mctx_params .use_gpu = self .use_gpu
2877- self .mctx_params .print_timings = self .verbose
2878- self .mctx_params .n_threads = llama_model .n_threads
2879- self .mctx_params .flash_attn_type = self ._mtmd_cpp .clip_flash_attn_type .CLIP_FLASH_ATTN_TYPE_AUTO
2880- self .mctx_params .warmup = True
2881- if self .image_min_tokens > 0 :
2882- self .mctx_params .image_min_tokens = self .image_min_tokens
2883- if self .image_max_tokens > 0 :
2884- self .mctx_params .image_max_tokens = self .image_max_tokens
2885- if (self .image_max_tokens < self .image_min_tokens ) and self .image_max_tokens > 0 :
2886- raise ValueError (f"{ self .log_prefix } (_init_mtmd_context): Configuration Error! image_max_tokens ({ self .image_max_tokens } ) "
2887- f"cannot be less than image_min_tokens ({ self .image_min_tokens } )." )
2888-
2889- # Cache the model's eos token and bos token
2890- self .mtmd_eos_token = llama_model .detokenize ([llama_model .token_eos ()]).decode ('utf-8' , errors = 'ignore' )
2891- self .mtmd_bos_token = llama_model .detokenize ([llama_model .token_bos ()]).decode ('utf-8' , errors = 'ignore' )
2892-
2893- # Cache the mtmd_default_marker
2894- self .media_marker = self ._mtmd_cpp .mtmd_default_marker ().decode ('utf-8' )
2895-
2896- # Initialize mtmd context
2897- self .mtmd_ctx = self ._mtmd_cpp .mtmd_init_from_file (
2898- self .clip_model_path .encode (),
2899- llama_model .model ,
2900- self .mctx_params
2901- )
2902-
2903- if self .mtmd_ctx is None :
2904- raise ValueError (f"{ self .log_prefix } (_init_mtmd_context): Failed to load mtmd context from: { self .clip_model_path } " )
2871+ self ._mtmd_cpp .mtmd_helper_log_set (llama_log_callback , ctypes .c_void_p (0 ))
2872+
2873+ # Get default parameters
2874+ self .mctx_params = self ._mtmd_cpp .mtmd_context_params_default ()
2875+ self .mctx_params .use_gpu = self .use_gpu
2876+ self .mctx_params .print_timings = self .verbose
2877+ self .mctx_params .n_threads = llama_model .n_threads
2878+ self .mctx_params .flash_attn_type = self ._mtmd_cpp .clip_flash_attn_type .CLIP_FLASH_ATTN_TYPE_AUTO
2879+ self .mctx_params .warmup = True
2880+ if self .image_min_tokens > 0 :
2881+ self .mctx_params .image_min_tokens = self .image_min_tokens
2882+ if self .image_max_tokens > 0 :
2883+ self .mctx_params .image_max_tokens = self .image_max_tokens
2884+ if (self .image_max_tokens < self .image_min_tokens ) and self .image_max_tokens > 0 :
2885+ raise ValueError (f"{ self .log_prefix } (_init_mtmd_context): Configuration Error! image_max_tokens ({ self .image_max_tokens } ) "
2886+ f"cannot be less than image_min_tokens ({ self .image_min_tokens } )." )
2887+
2888+ # Cache the model's eos token and bos token
2889+ self .mtmd_eos_token = llama_model .detokenize ([llama_model .token_eos ()]).decode ('utf-8' , errors = 'ignore' )
2890+ self .mtmd_bos_token = llama_model .detokenize ([llama_model .token_bos ()]).decode ('utf-8' , errors = 'ignore' )
2891+
2892+ # Cache the mtmd_default_marker
2893+ self .media_marker = self ._mtmd_cpp .mtmd_default_marker ().decode ('utf-8' )
2894+
2895+ # Initialize mtmd context
2896+ self .mtmd_ctx = self ._mtmd_cpp .mtmd_init_from_file (
2897+ self .clip_model_path .encode (),
2898+ llama_model .model ,
2899+ self .mctx_params
2900+ )
29052901
2906- # Check if vision is supported
2907- self .is_support_vision = self ._mtmd_cpp .mtmd_support_vision (self .mtmd_ctx )
2908- if self .is_support_vision :
2909- if self .verbose :
2910- print (f"{ self .log_prefix } (_init_mtmd_context): Vision support detected." , file = sys .stderr )
2911- else :
2912- if self .verbose :
2913- print (f"{ self .log_prefix } (_init_mtmd_context): Vision is NOT supported by this mmproj model backend." , file = sys .stderr )
2902+ if self .mtmd_ctx is None :
2903+ raise ValueError (f"{ self .log_prefix } (_init_mtmd_context): Failed to load mtmd context from: { self .clip_model_path } " )
29142904
2915- # Check if audio is supported
2916- self .is_support_audio = self ._mtmd_cpp .mtmd_support_audio (self .mtmd_ctx )
2917- if self .is_support_audio :
2918- if self .verbose :
2919- print (f"{ self .log_prefix } (_init_mtmd_context): Audio support detected." , file = sys .stderr )
2920- else :
2921- if self .verbose :
2922- print (f"{ self .log_prefix } (_init_mtmd_context): Audio is NOT supported by this mmproj model backend." , file = sys .stderr )
2905+ # Check if vision is supported
2906+ self .is_support_vision = self ._mtmd_cpp .mtmd_support_vision (self .mtmd_ctx )
2907+ if self .is_support_vision :
2908+ if self .verbose :
2909+ print (f"{ self .log_prefix } (_init_mtmd_context): Vision support detected." , file = sys .stderr )
2910+ else :
2911+ if self .verbose :
2912+ print (f"{ self .log_prefix } (_init_mtmd_context): Vision is NOT supported by this mmproj model backend." , file = sys .stderr )
2913+
2914+ # Check if audio is supported
2915+ self .is_support_audio = self ._mtmd_cpp .mtmd_support_audio (self .mtmd_ctx )
2916+ if self .is_support_audio :
2917+ if self .verbose :
2918+ print (f"{ self .log_prefix } (_init_mtmd_context): Audio support detected." , file = sys .stderr )
2919+ else :
2920+ if self .verbose :
2921+ print (f"{ self .log_prefix } (_init_mtmd_context): Audio is NOT supported by this mmproj model backend." , file = sys .stderr )
29232922
29242923 def close (self ) -> None :
29252924 """Explicitly free the mtmd context and vision model resources."""
29262925 if getattr (self , "mtmd_ctx" , None ) is not None :
29272926 try :
2928- with suppress_stdout_stderr (disable = getattr (self , "verbose" , True )):
2929- self ._mtmd_cpp .mtmd_free (self .mtmd_ctx )
2927+ self ._mtmd_cpp .mtmd_free (self .mtmd_ctx )
29302928 except Exception :
29312929 pass
29322930 self .mtmd_ctx = None
@@ -3027,20 +3025,19 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes):
30273025 if self .mtmd_ctx is None :
30283026 raise ValueError (f"{ self .log_prefix } (_create_bitmap_from_bytes): mtmd context not initialized." )
30293027
3030- with suppress_stdout_stderr (disable = self .verbose ):
3031- # Create bitmap from buffer using helper function
3032- bitmap = self ._mtmd_cpp .mtmd_helper_bitmap_init_from_buf (
3033- self .mtmd_ctx ,
3034- (ctypes .c_uint8 * len (media_bytes )).from_buffer (bytearray (media_bytes )),
3035- len (media_bytes )
3036- )
3028+ # Create bitmap from buffer using helper function
3029+ bitmap = self ._mtmd_cpp .mtmd_helper_bitmap_init_from_buf (
3030+ self .mtmd_ctx ,
3031+ (ctypes .c_uint8 * len (media_bytes )).from_buffer (bytearray (media_bytes )),
3032+ len (media_bytes )
3033+ )
30373034
3038- if bitmap is None :
3039- raise ValueError (f"{ self .log_prefix } (_create_bitmap_from_bytes): "
3040- "Failed to load image or audio file from media bytes "
3041- "(unsupported media format or corrupted data)." )
3035+ if bitmap is None :
3036+ raise ValueError (f"{ self .log_prefix } (_create_bitmap_from_bytes): "
3037+ "Failed to load image or audio file from media bytes "
3038+ "(unsupported media format or corrupted data)." )
30423039
3043- return bitmap
3040+ return bitmap
30443041
30453042
30463043 def _process_mtmd_prompt (
0 commit comments