
Commit 771721e

Requested refactor to use stream_options in Llama class.
1 parent a04db0e commit 771721e

File tree

5 files changed: +62 -39 lines changed

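The refactor replaces the ad-hoc stream_include_usage flag with an OpenAI-style stream_options parameter on the Llama API and the chat handlers. As a minimal sketch of the intended call shape after this commit (model path and prompt are placeholders, not taken from the diff), a caller passes the options dict directly:

from llama_cpp import Llama

llm = Llama(model_path="./models/model.gguf")  # placeholder path to a local GGUF model

# stream_options follows the StreamOptions TypedDict added in llama_types.py,
# so a plain dict carrying include_usage is the expected shape.
for chunk in llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
    stream_options={"include_usage": True},
):
    print(chunk)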

llama_cpp/llama.py (+11 -8)
@@ -1013,7 +1013,7 @@ def _create_completion(
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[StreamOptions] = None,
         seed: Optional[int] = None,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
@@ -1229,7 +1229,10 @@ def logit_bias_processor(
                 break

             if stream:
-                include_usage = stream_include_usage
+                if stream_options is None or stream_options.include_usage == None:
+                    include_usage = False
+                else:
+                    include_usage = stream_options.include_usage
                 remaining_tokens = completion_tokens[returned_tokens:]
                 remaining_text = self.detokenize(remaining_tokens, prev_tokens=prompt_tokens + completion_tokens[:returned_tokens])
                 remaining_length = len(remaining_text)
@@ -1583,7 +1586,7 @@ def create_completion(
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: bool = False,
+        stream_options: Optional[StreamOptions] = None,
         seed: Optional[int] = None,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
@@ -1647,7 +1650,7 @@ def create_completion(
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             seed=seed,
             tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
@@ -1682,7 +1685,7 @@ def __call__(
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[StreamOptions] = None,
         seed: Optional[int] = None,
         tfs_z: float = 1.0,
         mirostat_mode: int = 0,
@@ -1746,7 +1749,7 @@ def __call__(
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             seed=seed,
             tfs_z=tfs_z,
             mirostat_mode=mirostat_mode,
@@ -1772,7 +1775,7 @@ def create_chat_completion(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[StreamOptions] = False,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[ChatCompletionRequestResponseFormat] = None,
@@ -1844,7 +1847,7 @@ def create_chat_completion(
             logprobs=logprobs,
             top_logprobs=top_logprobs,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             response_format=response_format,
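For reference, the branch added to _create_completion above boils down to the following standalone sketch (a paraphrase, not code from the commit; it uses dict access because StreamOptions is a TypedDict):

from typing import Optional

from llama_cpp.llama_types import StreamOptions

def resolve_include_usage(stream_options: Optional[StreamOptions]) -> bool:
    # Fall back to False when stream_options is absent or include_usage was not set.
    if stream_options is None or stream_options.get("include_usage") is None:
        return False
    return bool(stream_options["include_usage"])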

llama_cpp/llama_chat_format.py (+45 -17)
@@ -66,7 +66,7 @@ def __call__(
         top_p: float = 0.95,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -544,7 +544,7 @@ def chat_completion_handler(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -577,6 +577,11 @@ def chat_completion_handler(
             tool_choice=tool_choice,
         )
         prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
+        if stream and ( stream_options is not None or stream_options.include_usage is not None):
+            stream_include_usage = stream_options.include_usage
+        else:
+            stream_include_usage = False
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -639,7 +644,7 @@ def chat_completion_handler(
             typical_p=typical_p,
             logprobs=top_logprobs if logprobs else None,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             max_tokens=max_tokens,
@@ -1372,7 +1377,7 @@ def functionary_chat_handler(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional(llama_types.StreamOptions) = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -1390,6 +1395,12 @@ def functionary_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and ( stream_options is not None or stream_options.include_usage is not None):
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     def generate_type_definition(
         param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs
     ) -> str:
@@ -1580,7 +1591,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
             min_p=min_p,
             typical_p=typical_p,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=["user:", "</s>"],
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,
@@ -1601,7 +1612,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
         ):
             stop = "\n"
             completion: llama_types.Completion = llama.create_completion(
-                prompt=prompt, stop=stop, stream=False, stream_include_usage=stream_include_usage
+                prompt=prompt, stop=stop, stream=False, stream_options=stream_options
             ) # type: ignore
             completion_text = completion["choices"][0]["text"]
             # strip " to=functions." and ending ":"
@@ -1657,7 +1668,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
             prompt=new_prompt,
             stop=["user:", "</s>"],
             stream=False,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             grammar=grammar,
             max_tokens=max_tokens,
             temperature=temperature,
@@ -1734,7 +1745,7 @@ def functionary_v1_v2_chat_handler(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional[llama_types.StreamOptions] = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -1752,6 +1763,12 @@ def functionary_v1_v2_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and ( stream_options is not None or stream_options.include_usage is not None):
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     tokenizer = llama.tokenizer_
     assert hasattr(
         tokenizer, "hf_tokenizer"
@@ -1952,7 +1969,7 @@ def prepare_messages_for_inference(
         min_p=min_p,
         typical_p=typical_p,
         stream=stream,
-        stream_include_usage=stream_include_usage,
+        stream_options=stream_options,
         stop=stop,
         max_tokens=max_tokens,
         presence_penalty=presence_penalty,
@@ -2013,7 +2030,7 @@ def create_completion(prompt, stop, grammar):
         min_p=min_p,
         typical_p=typical_p,
         stream=stream,
-        stream_include_usage=stream_include_usage,
+        stream_options=stream_options,
         stop=stop,
         max_tokens=max_tokens,
         presence_penalty=presence_penalty,
@@ -2597,7 +2614,7 @@ def __call__(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: bool = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -2623,6 +2640,11 @@ def __call__(
         Iterator[llama_types.CreateChatCompletionStreamResponse],
     ]:
         assert self.clip_ctx is not None
+        # Define stream inclusion to avoid having to re-calculate it multiple times
+        if stream and ( stream_options is not None or stream_options.include_usage is not None):
+            stream_include_usage = stream_options.include_usage
+        else:
+            stream_include_usage = False

         system_prompt = _get_system_message(messages)
         if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
@@ -2745,7 +2767,7 @@ def embed_image_bytes(image_bytes: bytes):
             typical_p=typical_p,
             logprobs=top_logprobs if logprobs else None,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             max_tokens=max_tokens,
@@ -3205,7 +3227,7 @@ def chatml_function_calling(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional[llama_types.StreamOptions] = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -3308,6 +3330,12 @@ def chatml_function_calling(

     stop = [stop, "<|im_end|>"] if isinstance(stop, str) else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and ( stream_options is not None or stream_options.include_usage is not None):
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     # Case 1: No tool choice by user
     if (
         tool_choice is None
@@ -3334,7 +3362,7 @@ def chatml_function_calling(
                 min_p=min_p,
                 typical_p=typical_p,
                 stream=stream,
-                stream_include_usage=stream_include_usage,
+                stream_options=stream_options,
                 stop=stop,
                 max_tokens=max_tokens,
                 presence_penalty=presence_penalty,
@@ -3350,7 +3378,7 @@ def chatml_function_calling(
                 logprobs=top_logprobs if logprobs else None,
             ),
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_include_usage=stream_include_usage
         )

     # Case 2: Tool choice by user
@@ -3389,7 +3417,7 @@ def chatml_function_calling(
             min_p=min_p,
             typical_p=typical_p,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             max_tokens=max_tokens,
             presence_penalty=presence_penalty,
@@ -3461,7 +3489,7 @@ def chatml_function_calling(
             min_p=min_p,
             typical_p=typical_p,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=["<|im_end|>"],
             logprobs=top_logprobs if logprobs else None,
             max_tokens=None,

llama_cpp/llama_types.py (+2 -0)
@@ -273,6 +273,8 @@ class ChatCompletionNamedToolChoice(TypedDict):
     type: Literal["function"]
     function: ChatCompletionNamedToolChoiceFunction

+class StreamOptions(TypedDict):
+    include_usage: Optional[bool]

 ChatCompletionToolChoiceOption = Union[
     Literal["none", "auto"], ChatCompletionNamedToolChoice
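The new StreamOptions TypedDict gives the parameter a concrete shape; a value is just a dict, for example (assuming it is importable from llama_cpp.llama_types as shown above):

from llama_cpp.llama_types import StreamOptions

opts: StreamOptions = {"include_usage": True}  # ask for usage stats in the final streamed chunk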

llama_cpp/server/app.py (+1 -9)
@@ -276,7 +276,6 @@ async def create_completion(
         "logit_bias_type",
         "user",
         "min_tokens",
-        "stream_options",
     }
     kwargs = body.model_dump(exclude=exclude)

@@ -287,8 +286,6 @@ async def create_completion(
             else body.logit_bias
         )
     # Set usage when streaming (if defined)
-    if body.stream_options is not None and body.stream_options.include_usage:
-        kwargs["stream_include_usage"] = body.stream_options.include_usage

     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)
@@ -460,7 +457,6 @@ async def create_chat_completion(
         "logit_bias_type",
         "user",
         "min_tokens",
-        "stream_options",
     }
     kwargs = body.model_dump(exclude=exclude)
     llama = llama_proxy(body.model)
@@ -470,11 +466,7 @@ async def create_chat_completion(
             if body.logit_bias_type == "tokens"
             else body.logit_bias
         )
-
-    # Set usage inclusion in stream (if defined)
-    if body.stream_options is not None and body.stream_options.include_usage:
-        kwargs["stream_include_usage"] = body.stream_options.include_usage
-
+
     if body.grammar is not None:
         kwargs["grammar"] = llama_cpp.LlamaGrammar.from_string(body.grammar)

llama_cpp/server/types.py (+3 -5)
@@ -54,6 +54,7 @@
     description="Whether to stream the results as they are generated. Useful for chatbots.",
 )

+# I'm currently not sure, how to best give this as settings.
 include_usage_field = Field(
     default=False,
     description="If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.",
@@ -110,9 +111,6 @@
     description="A CBNF grammar (as string) to be used for formatting the model's output.",
 )

-class StreamOptions(BaseModel):
-    include_usage: Optional[bool] = include_usage_field
-
 class CreateCompletionRequest(BaseModel):
     prompt: Union[str, List[str]] = Field(
         default="", description="The prompt to generate completions for."
@@ -134,7 +132,7 @@ class CreateCompletionRequest(BaseModel):
     )
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
-    stream_options: Optional[Union[StreamOptions, None]] = Field(
+    stream_options: Optional[llama_cpp.StreamOptions] = Field(
         default=None,
         description="Options for streaming response. Only set this when you set stream: true.",
     )
@@ -239,7 +237,7 @@ class CreateChatCompletionRequest(BaseModel):
     min_p: float = min_p_field
     stop: Optional[Union[str, List[str]]] = stop_field
     stream: bool = stream_field
-    stream_options: Optional[Union[StreamOptions, None]] = Field(
+    stream_options: Optional[llama_cpp.StreamOptions] = Field(
         default=None,
         description="Options for streaming response. Only set this when you set stream: true.",
     )
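On the wire this matches the OpenAI-style request shape described by the field above. A hedged sketch of exercising it against a locally running llama-cpp-python server (address and model name are placeholders):

import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",    # placeholder address of a local server
    json={
        "model": "local-model",                     # placeholder model name
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "stream_options": {"include_usage": True},  # maps to CreateChatCompletionRequest.stream_options
    },
    stream=True,
)
for line in resp.iter_lines():
    if line:
        print(line.decode("utf-8"))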
