
Commit 771721e

Requested refactor to use stream_options in Llama class.
1 parent a04db0e commit 771721e

File tree

5 files changed: +62 -39 lines changed

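The refactor replaces the ad-hoc stream_include_usage flag with an OpenAI-style stream_options parameter on the Llama API and the chat handlers. As a minimal sketch of the intended call shape after this commit (model path and prompt are placeholders, not taken from the diff), a caller passes the options dict directly:

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path to a local GGUF model

# stream_options follows the StreamOptions TypedDict added in llama_types.py,
# so a plain dict carrying include_usage is the expected shape.
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": True},
):
    print(chunk)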

llama_cpp/llama.py (+11 -8)
@@ -1013,7 +1013,7 @@ def _create_completion(
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[StreamOptions] = None,
         seed: Optional[int] = None,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
@@ -1229,7 +1229,10 @@ def logit_bias_processor(
                 break

             if stream:
-                include_usage = stream_include_usage
+                if stream_options is None or stream_options.include_usage == None:
+                    include_usage = False
+                else:
+                    include_usage = stream_options.include_usage
                 remaining_tokens = completion_tokens[returned_tokens:]
                 remaining_text = self.detokenize(remaining_tokens, prev_tokens=prompt_tokens + completion_tokens[:returned_tokens])
                 remaining_length = len(remaining_text)
@@ -1583,7 +1586,7 @@ def create_completion(
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: bool = False,
+        stream_options: Optional[StreamOptions] = None,
         seed: Optional[int] = None,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
@@ -1647,7 +1650,7 @@ def create_completion(
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             seed=seed,
             tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
@@ -1682,7 +1685,7 @@ def __call__(
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[StreamOptions] = None,
         seed: Optional[int] = None,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
@@ -1746,7 +1749,7 @@ def __call__(
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             seed=seed,
             tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
@@ -1772,7 +1775,7 @@ def create_chat_completion(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[StreamOptions] = False,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[ChatCompletionRequestResponseFormat] = None,
@@ -1844,7 +1847,7 @@ def create_chat_completion(
             logprobs=logprobs,
             top_logprobs=top_logprobs,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             response_format=response_format,
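For reference, the branch added to _create_completion above boils down to the following standalone sketch (a paraphrase, not code from the commit; it uses dict access because StreamOptions is a TypedDict):

from typing import Optional

from llama_cpp.llama_types import StreamOptions

def resolve_include_usage(stream_options: Optional[StreamOptions]) -> bool:
    # Fall back to False when stream_options is absent or include_usage was not set.
    if stream_options is None or stream_options.get("include_usage") is None:
        return False
    return bool(stream_options["include_usage"])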

llama_cpp/llama_chat_format.py (+45 -17)
@@ -66,7 +66,7 @@ def __call__(
         top_p: float = 0.95,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -544,7 +544,7 @@ def chat_completion_handler(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -577,6 +577,11 @@ def chat_completion_handler(
             tool_choice=tool_choice,
         )
         prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
+        if stream and ( stream_options is not None or stream_options.include_usage is not None):
+            stream_include_usage = stream_options.include_usage
+        else:
+            stream_include_usage = False
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -639,7 +644,7 @@ def chat_completion_handler(
             typical_p=typical_p,
             logprobs=top_logprobs if logprobs else None,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             max_tokens=max_tokens,
@@ -1372,7 +1377,7 @@ def functionary_chat_handler(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional(llama_types.StreamOptions) = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -1390,6 +1395,12 @@ def functionary_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and ( stream_options is not None or stream_options.include_usage is not None):
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     def generate_type_definition(
         param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs
     ) -> str:
@@ -1580,7 +1591,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
             min_p=min_p,
             typical_p=typical_p,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=["user:", "</s>"],
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,
@@ -1601,7 +1612,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
         ):
             stop = "\n"
             completion: llama_types.Completion = llama.create_completion(
-                prompt=prompt, stop=stop, stream=False, stream_include_usage=stream_include_usage
+                prompt=prompt, stop=stop, stream=False, stream_options=stream_options
             ) # type: ignore
             completion_text = completion["choices"][0]["text"]
             # strip " to=functions." and ending ":"
@@ -1657,7 +1668,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
             prompt=new_prompt,
             stop=["user:", "</s>"],
             stream=False,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             grammar=grammar,
             max_tokens=max_tokens,
             temperature=temperature,
@@ -1734,7 +1745,7 @@ def functionary_v1_v2_chat_handler(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional[llama_types.StreamOptions] = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -1752,6 +1763,12 @@ def functionary_v1_v2_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and ( stream_options is not None or stream_options.include_usage is not None):
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     tokenizer = llama.tokenizer_
     assert hasattr(
         tokenizer, "hf_tokenizer"
@@ -1952,7 +1969,7 @@ def prepare_messages_for_inference(
         min_p=min_p,
         typical_p=typical_p,
         stream=stream,
-        stream_include_usage=stream_include_usage,
+        stream_options=stream_options,
         stop=stop,
         max_tokens=max_tokens,
         presence_penalty=presence_penalty,
@@ -2013,7 +2030,7 @@ def create_completion(prompt, stop, grammar):
         min_p=min_p,
         typical_p=typical_p,
         stream=stream,
-        stream_include_usage=stream_include_usage,
+        stream_options=stream_options,
         stop=stop,
         max_tokens=max_tokens,
         presence_penalty=presence_penalty,
@@ -2597,7 +2614,7 @@ def __call__(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: bool = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -2623,6 +2640,11 @@ def __call__(
         Iterator[llama_types.CreateChatCompletionStreamResponse],
     ]:
         assert self.clip_ctx is not None
+        # Define stream inclusion to avoid having to re-calculate it multiple times
+        if stream and ( stream_options is not None or stream_options.include_usage is not None):
+            stream_include_usage = stream_options.include_usage
+        else:
+            stream_include_usage = False

         system_prompt = _get_system_message(messages)
         if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
@@ -2745,7 +2767,7 @@ def embed_image_bytes(image_bytes: bytes):
             typical_p=typical_p,
             logprobs=top_logprobs if logprobs else None,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             max_tokens=max_tokens,
@@ -3205,7 +3227,7 @@ def chatml_function_calling(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional[llama_types.StreamOptions] = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -3308,6 +3330,12 @@ def chatml_function_calling(

     stop = [stop, "<|im_end|>"] if isinstance(stop, str) else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and ( stream_options is not None or stream_options.include_usage is not None):
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     # Case 1: No tool choice by user
     if (
         tool_choice is None
@@ -3334,7 +3362,7 @@ def chatml_function_calling(
                 min_p=min_p,
                 typical_p=typical_p,
                 stream=stream,
-                stream_include_usage=stream_include_usage,
+                stream_options=stream_options,
                 stop=stop,
                 max_tokens=max_tokens,
                 presence_penalty=presence_penalty,
@@ -3350,7 +3378,7 @@ def chatml_function_calling(
                 logprobs=top_logprobs if logprobs else None,
             ),
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_include_usage=stream_include_usage
         )

     # Case 2: Tool choice by user
@@ -3389,7 +3417,7 @@ def chatml_function_calling(
             min_p=min_p,
             typical_p=typical_p,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,
@@ -3461,7 +3489,7 @@ def chatml_function_calling(
             min_p=min_p,
             typical_p=typical_p,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=["<|im_end|>"],
             logprobs=top_logprobs if logprobs else None,
             max_tokens=None,

llama_cpp/llama_types.py (+2 -0)
@@ -273,6 +273,8 @@ class ChatCompletionNamedToolChoice(TypedDict):
     type: Literal["function"]
     function: ChatCompletionNamedToolChoiceFunction

+class StreamOptions(TypedDict):
+    include_usage: Optional[bool]

 ChatCompletionToolChoiceOption = Union[
     Literal["none", "auto"], ChatCompletionNamedToolChoice
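The new StreamOptions TypedDict gives the parameter a concrete shape; a value is just a dict, for example (assuming it is importable from llama_cpp.llama_types as shown above):

from llama_cpp.llama_types import StreamOptions

opts: StreamOptions = {"include_usage": True}  # ask for usage stats in the final streamed chunk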

llama_cpp/server/app.py (+1 -9)
@@ -276,7 +276,6 @@ async def create_completion(
         "logit_bias_type",
         "user",
         "min_tokens",
-        "stream_options",
     }
     kwargs = body.model_dump(exclude=exclude)

@@ -287,8 +286,6 @@ async def create_completion(
             else body.logit_bias
         )
     # Set usage when streaming (if defined)
-    if body.stream_options is not None and body.stream_options.include_usage:
-        kwargs["stream_include_usage"] = body.stream_options.include_usage

     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
@@ -460,7 +457,6 @@ async def create_chat_completion(
         "logit_bias_type",
         "user",
         "min_tokens",
-        "stream_options",
     }
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)
@@ -470,11 +466,7 @@ async def create_chat_completion(
             if body.logit_bias_type == "tokens"
             else body.logit_bias
         )
-
-    # Set usage inclusion in stream (if defined)
-    if body.stream_options is not None and body.stream_options.include_usage:
-        kwargs["stream_include_usage"] = body.stream_options.include_usage
-
+
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

llama_cpp/server/types.py (+3 -5)
@@ -54,6 +54,7 @@
     description="Whether to stream the results as they are generated. Useful for chatbots.",
 )

+# I'm currently not sure, how to best give this as settings.
 include_usage_field = Field(
     default=False,
     description="If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.",
@@ -110,9 +111,6 @@
     description="A CBNF grammar (as string) to be used for formatting the model's output.",
 )

-class StreamOptions(BaseModel):
-    include_usage: Optional[bool] = include_usage_field
-
 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]] = Field(
         default="", description="The prompt to generate completions for."
@@ -134,7 +132,7 @@ class CreateCompletionRequest(BaseModel):
     )
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
-    stream_options: Optional[Union[StreamOptions, None]] = Field(
+    stream_options: Optional[llama_cpp.StreamOptions] = Field(
         default=None,
         description="Options for streaming response. Only set this when you set stream: true.",
     )
@@ -239,7 +237,7 @@ class CreateChatCompletionRequest(BaseModel):
     min_p: float = min_p_field
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
-    stream_options: Optional[Union[StreamOptions, None]] = Field(
+    stream_options: Optional[llama_cpp.StreamOptions] = Field(
         default=None,
         description="Options for streaming response. Only set this when you set stream: true.",
     )
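On the wire this matches the OpenAI-style request shape described by the field above. A hedged sketch of exercising it against a locally running llama-cpp-python server (address and model name are placeholders):

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",    # placeholder address of a local server
    json={
        "model": "local-model",                     # placeholder model name
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "stream_options": {"include_usage": True},  # maps to CreateChatCompletionRequest.stream_options
    },
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(line.decode("utf-8"))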
