@@ -66,7 +66,7 @@ def __call__(
         top_p: float = 0.95,
         top_k: int = 40,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
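The new parameter is read with attribute access (stream_options.include_usage) further down in this diff, so llama_types.StreamOptions is assumed to expose an include_usage field mirroring OpenAI's stream_options; its real definition lives in llama_types and is not shown here. A purely illustrative sketch of such a type:

    # Hypothetical shape only; the actual llama_types.StreamOptions may differ.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class StreamOptions:
        # When set together with stream=True, the handlers below read this flag
        # to decide whether usage information should accompany the streamed response.
        include_usage: Optional[bool] = None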
@@ -544,7 +544,7 @@ def chat_completion_handler(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: Optional[bool] = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -577,6 +577,11 @@ def chat_completion_handler(
             tool_choice=tool_choice,
         )
         prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
+        if stream and stream_options is not None and stream_options.include_usage is not None:
+            stream_include_usage = stream_options.include_usage
+        else:
+            stream_include_usage = False
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -639,7 +644,7 @@ def chat_completion_handler(
             typical_p=typical_p,
             logprobs=top_logprobs if logprobs else None,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             max_tokens=max_tokens,
@@ -1372,7 +1377,7 @@ def functionary_chat_handler(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional[llama_types.StreamOptions] = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -1390,6 +1395,12 @@ def functionary_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and stream_options is not None and stream_options.include_usage is not None:
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     def generate_type_definition(
         param: Dict[str, llama_types.JsonType], indent_level: int, shared_defs
     ) -> str:
@@ -1580,7 +1591,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
         min_p=min_p,
         typical_p=typical_p,
         stream=stream,
-        stream_include_usage=stream_include_usage,
+        stream_options=stream_options,
         stop=["user:", "</s>"],
         max_tokens=max_tokens,
         presence_penalty=presence_penalty,
@@ -1601,7 +1612,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
     ):
         stop = "\n"
         completion: llama_types.Completion = llama.create_completion(
-            prompt=prompt, stop=stop, stream=False, stream_include_usage=stream_include_usage
+            prompt=prompt, stop=stop, stream=False, stream_options=stream_options
         )  # type: ignore
         completion_text = completion["choices"][0]["text"]
         # strip " to=functions." and ending ":"
@@ -1657,7 +1668,7 @@ def message_to_str(msg: llama_types.ChatCompletionRequestMessage):
             prompt=new_prompt,
             stop=["user:", "</s>"],
             stream=False,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             grammar=grammar,
             max_tokens=max_tokens,
             temperature=temperature,
@@ -1734,7 +1745,7 @@ def functionary_v1_v2_chat_handler(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional[llama_types.StreamOptions] = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -1752,6 +1763,12 @@ def functionary_v1_v2_chat_handler(
 ) -> Union[llama_types.ChatCompletion, Iterator[llama_types.ChatCompletionChunk]]:
     SYSTEM_MESSAGE = """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"""

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and stream_options is not None and stream_options.include_usage is not None:
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     tokenizer = llama.tokenizer_
     assert hasattr(
         tokenizer, "hf_tokenizer"
@@ -1952,7 +1969,7 @@ def prepare_messages_for_inference(
         min_p=min_p,
         typical_p=typical_p,
         stream=stream,
-        stream_include_usage=stream_include_usage,
+        stream_options=stream_options,
         stop=stop,
         max_tokens=max_tokens,
         presence_penalty=presence_penalty,
@@ -2013,7 +2030,7 @@ def create_completion(prompt, stop, grammar):
            min_p=min_p,
            typical_p=typical_p,
            stream=stream,
-           stream_include_usage=stream_include_usage,
+           stream_options=stream_options,
            stop=stop,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
@@ -2597,7 +2614,7 @@ def __call__(
         min_p: float = 0.05,
         typical_p: float = 1.0,
         stream: bool = False,
-        stream_include_usage: bool = False,
+        stream_options: Optional[llama_types.StreamOptions] = None,
         stop: Optional[Union[str, List[str]]] = [],
         seed: Optional[int] = None,
         response_format: Optional[
@@ -2623,6 +2640,11 @@ def __call__(
         Iterator[llama_types.CreateChatCompletionStreamResponse],
     ]:
         assert self.clip_ctx is not None
+        # Define stream inclusion to avoid having to re-calculate it multiple times
+        if stream and stream_options is not None and stream_options.include_usage is not None:
+            stream_include_usage = stream_options.include_usage
+        else:
+            stream_include_usage = False

         system_prompt = _get_system_message(messages)
         if system_prompt == "" and self.DEFAULT_SYSTEM_MESSAGE is not None:
@@ -2745,7 +2767,7 @@ def embed_image_bytes(image_bytes: bytes):
             typical_p=typical_p,
             logprobs=top_logprobs if logprobs else None,
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_options=stream_options,
             stop=stop,
             seed=seed,
             max_tokens=max_tokens,
@@ -3205,7 +3227,7 @@ def chatml_function_calling(
     min_p: float = 0.05,
     typical_p: float = 1.0,
     stream: bool = False,
-    stream_include_usage: bool = False,
+    stream_options: Optional[llama_types.StreamOptions] = None,
     stop: Optional[Union[str, List[str]]] = [],
     response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
     max_tokens: Optional[int] = None,
@@ -3308,6 +3330,12 @@ def chatml_function_calling(

     stop = [stop, "<|im_end|>"] if isinstance(stop, str) else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]

+    # Define stream inclusion to avoid having to re-calculate it multiple times
+    if stream and stream_options is not None and stream_options.include_usage is not None:
+        stream_include_usage = stream_options.include_usage
+    else:
+        stream_include_usage = False
+
     # Case 1: No tool choice by user
     if (
         tool_choice is None
@@ -3334,7 +3362,7 @@ def chatml_function_calling(
                 min_p=min_p,
                 typical_p=typical_p,
                 stream=stream,
-                stream_include_usage=stream_include_usage,
+                stream_options=stream_options,
                 stop=stop,
                 max_tokens=max_tokens,
                 presence_penalty=presence_penalty,
@@ -3350,7 +3378,7 @@ def chatml_function_calling(
                 logprobs=top_logprobs if logprobs else None,
             ),
             stream=stream,
-            stream_include_usage=stream_include_usage,
+            stream_include_usage=stream_include_usage
         )

     # Case 2: Tool choice by user
@@ -3389,7 +3417,7 @@ def chatml_function_calling(
            min_p=min_p,
            typical_p=typical_p,
            stream=stream,
-           stream_include_usage=stream_include_usage,
+           stream_options=stream_options,
            stop=stop,
            max_tokens=max_tokens,
            presence_penalty=presence_penalty,
@@ -3461,7 +3489,7 @@ def chatml_function_calling(
            min_p=min_p,
            typical_p=typical_p,
            stream=stream,
-           stream_include_usage=stream_include_usage,
+           stream_options=stream_options,
            stop=["<|im_end|>"],
            logprobs=top_logprobs if logprobs else None,
            max_tokens=None,
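Taken together, these hunks swap the old stream_include_usage flag for an OpenAI-style stream_options object at every handler boundary. A rough caller-side sketch under the assumptions above (model path and the construction of StreamOptions are placeholders, not taken from this diff):

    # Hypothetical usage sketch; assumes the stream_options kwarg introduced by this diff.
    from llama_cpp import Llama
    import llama_cpp.llama_types as llama_types

    llm = Llama(model_path="./model.gguf")  # placeholder model path
    chunks = llm.create_chat_completion(
        messages=[{"role": "user", "content": "Hello"}],
        stream=True,
        # Replaces the former stream_include_usage=True flag.
        stream_options=llama_types.StreamOptions(include_usage=True),
    )
    for chunk in chunks:
        # With include_usage enabled, the handlers above set stream_include_usage=True,
        # which is expected to attach token usage information to the streamed output.
        print(chunk)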