33 | 33 | import urllib.request |
34 | 34 | from urllib.error import URLError, HTTPError |
35 | 35 |
36 | | -import llama_cpp.llama_cpp as llama_cpp |
37 | | -import llama_cpp.llama as llama |
| 36 | +import llama_cpp.llama_cpp as llama_cpp_lib |
| 37 | +import llama_cpp.llama as llama_core |
38 | 38 | import llama_cpp.llama_types as llama_types |
39 | 39 | import llama_cpp.llama_grammar as llama_grammar |
40 | 40 |
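Aside, not part of the diff: the import aliases matter because every handler signature below takes a keyword parameter literally named `llama`, which would shadow a module imported as plain `llama` inside the function body. A minimal sketch of the pattern (simplified stand-in signature, not the real handler code):

    from typing import Optional

    import llama_cpp.llama as llama_core            # module alias, so the name survives the shadow
    import llama_cpp.llama_grammar as llama_grammar

    def chat_completion_handler(
        *,
        llama: llama_core.Llama,                    # instance parameter keeps its public name
        grammar: Optional[llama_grammar.LlamaGrammar] = None,
        **kwargs,
    ):
        # Inside the body, `llama` is the bound model instance, not the module;
        # module-level helpers stay reachable through the alias.
        stopping_criteria = llama_core.StoppingCriteriaList([])
        ...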
@@ -85,7 +85,7 @@ def __call__( |
85 | 85 | self, |
86 | 86 | *, |
87 | 87 | # llama.cpp instance |
88 | | - llama: llama.Llama, |
| 88 | + llama: llama_core.Llama, |
89 | 89 | # openai api parameters |
90 | 90 | messages: List[llama_types.ChatCompletionRequestMessage], |
91 | 91 | functions: Optional[List[llama_types.ChatCompletionFunction]] = None, |
@@ -124,8 +124,8 @@ def __call__( |
124 | 124 | adaptive_target : float = -1.0, |
125 | 125 | adaptive_decay : float = 0.9, |
126 | 126 | use_infill: bool = False, |
127 | | - logits_processor: Optional[llama.LogitsProcessorList] = None, |
128 | | - grammar: Optional[llama.LlamaGrammar] = None, |
| 127 | + logits_processor: Optional[llama_core.LogitsProcessorList] = None, |
| 128 | + grammar: Optional[llama_grammar.LlamaGrammar] = None, |
129 | 129 | logprobs: Optional[bool] = None, |
130 | 130 | top_logprobs: Optional[int] = None, |
131 | 131 | **kwargs, # type: ignore |
@@ -199,7 +199,7 @@ class ChatFormatterResponse: |
199 | 199 |
200 | 200 | prompt: str |
201 | 201 | stop: Optional[Union[str, List[str]]] = None |
202 | | - stopping_criteria: Optional[llama.StoppingCriteriaList] = None |
| 202 | + stopping_criteria: Optional[llama_core.StoppingCriteriaList] = None |
203 | 203 | added_special: bool = False |
204 | 204 |
205 | 205 |
@@ -281,7 +281,7 @@ def stop_on_last_token( |
281 | 281 | ) -> bool: |
282 | 282 | return tokens[-1] in self.stop_token_ids |
283 | 283 |
284 | | - stopping_criteria = llama.StoppingCriteriaList([stop_on_last_token]) |
| 284 | + stopping_criteria = llama_core.StoppingCriteriaList([stop_on_last_token]) |
285 | 285 |
286 | 286 | return ChatFormatterResponse( |
287 | 287 | prompt=prompt, |
@@ -585,7 +585,7 @@ def chat_formatter_to_chat_completion_handler( |
585 | 585 | ) -> LlamaChatCompletionHandler: |
586 | 586 | def chat_completion_handler( |
587 | 587 | *, |
588 | | - llama: llama.Llama, |
| 588 | + llama: llama_core.Llama, |
589 | 589 | messages: List[llama_types.ChatCompletionRequestMessage], |
590 | 590 | functions: Optional[List[llama_types.ChatCompletionFunction]] = None, |
591 | 591 | function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, |
@@ -621,8 +621,8 @@ def chat_completion_handler( |
621 | 621 | adaptive_decay : float = 0.9, |
622 | 622 | use_infill: bool = False, |
623 | 623 | model: Optional[str] = None, |
624 | | - logits_processor: Optional[llama.LogitsProcessorList] = None, |
625 | | - grammar: Optional[llama.LlamaGrammar] = None, |
| 624 | + logits_processor: Optional[llama_core.LogitsProcessorList] = None, |
| 625 | + grammar: Optional[llama_grammar.LlamaGrammar] = None, |
626 | 626 | logit_bias: Optional[Dict[str, float]] = None, |
627 | 627 | logprobs: Optional[bool] = None, |
628 | 628 | top_logprobs: Optional[int] = None, |
@@ -1467,7 +1467,7 @@ def format_gemma( |
1467 | 1467 |
1468 | 1468 | @register_chat_completion_handler("functionary") |
1469 | 1469 | def functionary_chat_handler( |
1470 | | - llama: llama.Llama, |
| 1470 | + llama: llama_core.Llama, |
1471 | 1471 | messages: List[llama_types.ChatCompletionRequestMessage], |
1472 | 1472 | functions: Optional[List[llama_types.ChatCompletionFunction]] = None, |
1473 | 1473 | function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, |
@@ -1500,8 +1500,8 @@ def functionary_chat_handler( |
1500 | 1500 | adaptive_decay : float = 0.9, |
1501 | 1501 | use_infill: bool = False, |
1502 | 1502 | model: Optional[str] = None, |
1503 | | - logits_processor: Optional[llama.LogitsProcessorList] = None, |
1504 | | - grammar: Optional[llama.LlamaGrammar] = None, |
| 1503 | + logits_processor: Optional[llama_core.LogitsProcessorList] = None, |
| 1504 | + grammar: Optional[llama_grammar.LlamaGrammar] = None, |
1505 | 1505 | **kwargs, # type: ignore |
1506 | 1506 | ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: |
1507 | 1507 | SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" |
@@ -1856,7 +1856,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage): |
1856 | 1856 | @register_chat_completion_handler("functionary-v1") |
1857 | 1857 | @register_chat_completion_handler("functionary-v2") |
1858 | 1858 | def functionary_v1_v2_chat_handler( |
1859 | | - llama: llama.Llama, |
| 1859 | + llama: llama_core.Llama, |
1860 | 1860 | messages: List[llama_types.ChatCompletionRequestMessage], |
1861 | 1861 | functions: Optional[List[llama_types.ChatCompletionFunction]] = None, |
1862 | 1862 | function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, |
@@ -1889,8 +1889,8 @@ def functionary_v1_v2_chat_handler( |
1889 | 1889 | adaptive_decay : float = 0.9, |
1890 | 1890 | use_infill: bool = False, |
1891 | 1891 | model: Optional[str] = None, |
1892 | | - logits_processor: Optional[llama.LogitsProcessorList] = None, |
1893 | | - grammar: Optional[llama.LlamaGrammar] = None, |
| 1892 | + logits_processor: Optional[llama_core.LogitsProcessorList] = None, |
| 1893 | + grammar: Optional[llama_grammar.LlamaGrammar] = None, |
1894 | 1894 | **kwargs, # type: ignore |
1895 | 1895 | ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]: |
1896 | 1896 | SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary""" |
@@ -2868,7 +2868,7 @@ def __init__( |
2868 | 2868 |
2869 | 2869 | self._exit_stack = ExitStack() |
2870 | 2870 |
2871 | | - def _init_mtmd_context(self, llama_model: llama.Llama): |
| 2871 | + def _init_mtmd_context(self, llama_model: llama_core.Llama): |
2872 | 2872 | """Initialize mtmd context with the llama model.""" |
2873 | 2873 | if self.mtmd_ctx is not None: |
2874 | 2874 | return # Already initialized |
@@ -3047,7 +3047,7 @@ def _create_bitmap_from_bytes(self, media_bytes: bytes): |
3047 | 3047 |
3048 | 3048 | def _process_mtmd_prompt( |
3049 | 3049 | self, |
3050 | | - llama: llama.Llama, |
| 3050 | + llama: llama_core.Llama, |
3051 | 3051 | messages: List[llama_types.ChatCompletionRequestMessage], |
3052 | 3052 | ) -> Tuple[List[int], List[tuple], Any, List[Any]]: |
3053 | 3053 | """ |
@@ -3212,7 +3212,7 @@ def _create_bitmap_func(idx: int, item: str): |
3212 | 3212 | def __call__( |
3213 | 3213 | self, |
3214 | 3214 | *, |
3215 | | - llama: llama.Llama, |
| 3215 | + llama: llama_core.Llama, |
3216 | 3216 | messages: List[llama_types.ChatCompletionRequestMessage], |
3217 | 3217 | functions: Optional[List[llama_types.ChatCompletionFunction]] = None, |
3218 | 3218 | function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, |
@@ -3248,8 +3248,8 @@ def __call__( |
3248 | 3248 | adaptive_decay : float = 0.9, |
3249 | 3249 | use_infill: bool = False, |
3250 | 3250 | model: Optional[str] = None, |
3251 | | - logits_processor: Optional[llama.LogitsProcessorList] = None, |
3252 | | - grammar: Optional[llama.LlamaGrammar] = None, |
| 3251 | + logits_processor: Optional[llama_core.LogitsProcessorList] = None, |
| 3252 | + grammar: Optional[llama_grammar.LlamaGrammar] = None, |
3253 | 3253 | logit_bias: Optional[Dict[str, float]] = None, |
3254 | 3254 | logprobs: Optional[bool] = None, |
3255 | 3255 | top_logprobs: Optional[int] = None, |
@@ -3367,13 +3367,13 @@ def __call__( |
3367 | 3367 | llama.n_tokens = n_past |
3368 | 3368 |
3369 | 3369 | # Execute C++ Multimodal Black-box Extraction |
3370 | | - new_n_past = llama_cpp.llama_pos(0) |
| 3370 | + new_n_past = llama_cpp_lib.llama_pos(0) |
3371 | 3371 | result = self._mtmd_cpp.mtmd_helper_eval_chunk_single( |
3372 | 3372 | self.mtmd_ctx, |
3373 | 3373 | llama._ctx.ctx, |
3374 | 3374 | chunk_ptr, |
3375 | | - llama_cpp.llama_pos(n_past), |
3376 | | - llama_cpp.llama_seq_id(0), |
| 3375 | + llama_cpp_lib.llama_pos(n_past), |
| 3376 | + llama_cpp_lib.llama_seq_id(0), |
3377 | 3377 | llama.n_batch, |
3378 | 3378 | True, # logits_last = True, drastically saves computational overhead |
3379 | 3379 | ctypes.byref(new_n_past) |
@@ -5022,7 +5022,7 @@ def __call__(self, **kwargs): |
5022 | 5022 |
5023 | 5023 | @register_chat_completion_handler("chatml-function-calling") |
5024 | 5024 | def chatml_function_calling( |
5025 | | - llama: llama.Llama, |
| 5025 | + llama: llama_core.Llama, |
5026 | 5026 | messages: List[llama_types.ChatCompletionRequestMessage], |
5027 | 5027 | functions: Optional[List[llama_types.ChatCompletionFunction]] = None, |
5028 | 5028 | function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None, |
@@ -5055,8 +5055,8 @@ def chatml_function_calling( |
5055 | 5055 | adaptive_decay : float = 0.9, |
5056 | 5056 | use_infill: bool = False, |
5057 | 5057 | model: Optional[str] = None, |
5058 | | - logits_processor: Optional[llama.LogitsProcessorList] = None, |
5059 | | - grammar: Optional[llama.LlamaGrammar] = None, |
| 5058 | + logits_processor: Optional[llama_core.LogitsProcessorList] = None, |
| 5059 | + grammar: Optional[llama_grammar.LlamaGrammar] = None, |
5060 | 5060 | logprobs: Optional[bool] = None, |
5061 | 5061 | top_logprobs: Optional[int] = None, |
5062 | 5062 | **kwargs, # type: ignore |