@@ -150,6 +150,9 @@ class OpenAIServingChat(OpenAIServing):
                         created=created_time,
                         choices=[choice_data],
                         model=model_name)
+                    if (request.stream_options
+                            and request.stream_options.include_usage):
+                        chunk.usage = None
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
@@ -179,6 +182,9 @@ class OpenAIServingChat(OpenAIServing):
                                 choices=[choice_data],
                                 logprobs=None,
                                 model=model_name)
+                            if (request.stream_options and
+                                    request.stream_options.include_usage):
+                                chunk.usage = None
                             data = chunk.model_dump_json(
                                 exclude_unset=True)
                             yield f"data: {data}\n\n"
@@ -230,17 +236,14 @@ class OpenAIServingChat(OpenAIServing):
                         created=created_time,
                         choices=[choice_data],
                         model=model_name)
+                    if (request.stream_options
+                            and request.stream_options.include_usage):
+                        chunk.usage = None
                     data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                 else:
                     # Send the finish response for each request.n only once
                     prompt_tokens = len(res.prompt_token_ids)
-                    final_usage = UsageInfo(
-                        prompt_tokens=prompt_tokens,
-                        completion_tokens=previous_num_tokens[i],
-                        total_tokens=prompt_tokens +
-                        previous_num_tokens[i],
-                    )
                     choice_data = ChatCompletionResponseStreamChoice(
                         index=i,
                         delta=delta_message,
@@ -253,12 +256,32 @@ class OpenAIServingChat(OpenAIServing):
                         created=created_time,
                         choices=[choice_data],
                         model=model_name)
-                    if final_usage is not None:
-                        chunk.usage = final_usage
-                    data = chunk.model_dump_json(exclude_unset=True,
-                                                 exclude_none=True)
+                    if (request.stream_options
+                            and request.stream_options.include_usage):
+                        chunk.usage = None
+                    data = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {data}\n\n"
                     finish_reason_sent[i] = True
+
+            if (request.stream_options
+                    and request.stream_options.include_usage):
+                final_usage = UsageInfo(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=previous_num_tokens[i],
+                    total_tokens=prompt_tokens +
+                    previous_num_tokens[i],
+                )
+
+                final_usage_chunk = ChatCompletionStreamResponse(
+                    id=request_id,
+                    object=chunk_object_type,
+                    created=created_time,
+                    choices=[],
+                    model=model_name,
+                    usage=final_usage)
+                final_usage_data = (final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True))
+                yield f"data: {final_usage_data}\n\n"
             except ValueError as e:
                 # TODO: Use an aphrodite-specific Validation Error
                 data = self.create_streaming_error_response(str(e))
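
For reference, a minimal client-side sketch of the behavior this diff enables. This is an illustration, not part of the change: it assumes the openai Python client (1.26 or later, where stream_options was added), a server on Aphrodite's default port 2242, and a placeholder model name. With include_usage set, every streamed chunk carries a usage field (null until the end), and one final chunk with empty choices reports the populated UsageInfo:

# Hypothetical client sketch; the endpoint, API key, and model name
# below are placeholders, not part of the diff.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:2242/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="my-model",  # placeholder
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
    # The option handled by this diff: report token usage in the stream.
    stream_options={"include_usage": True},
)

for chunk in stream:
    if chunk.choices:  # ordinary delta chunks; usage is None here
        print(chunk.choices[0].delta.content or "", end="")
    if chunk.usage is not None:  # final chunk: choices=[], usage populated
        print(f"\nprompt_tokens={chunk.usage.prompt_tokens}, "
              f"completion_tokens={chunk.usage.completion_tokens}, "
              f"total_tokens={chunk.usage.total_tokens}")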