lgarbarini and nsarrazin (HF Staff) committed
Commit df0f84c · unverified · 1 Parent(s): 28b6d44

feat(openai): added support for non-streaming o1 (e.g. Azure) models (#1687)

* feat(openai): added support for non-streaming o1 (e.g. Azure) models

* feat(docs): add non streaming example

* feat: moar docs

---------

Co-authored-by: Nathan Sarrazin <[email protected]>

README.md CHANGED
@@ -482,6 +482,29 @@ MODELS=`[{
 }]`
 ```
 
+_Non-streaming endpoints_
+
+For endpoints that don't support streaming like o1 on Azure, you can pass `streamingSupported: false` in your endpoint config:
+
+```
+MODELS=`[{
+  "id": "o1-preview",
+  "name": "o1-preview",
+  "displayName": "o1-preview",
+  "systemRoleSupported": false,
+  "endpoints": [
+    {
+      "type": "openai",
+      "baseURL": "https://my-deployment.openai.azure.com/openai/deployments/o1-preview",
+      "defaultHeaders": {
+        "api-key": "$SECRET"
+      },
+      "streamingSupported": false,
+    }
+  ]
+}]`
+```
+
 ##### Llama.cpp API server
 
 chat-ui also supports the llama.cpp API server directly without the need for an adapter. You can do this using the `llamacpp` endpoint type.
docs/source/configuration/models/providers/openai.md CHANGED
@@ -146,9 +146,36 @@ MODELS=`[{
 }]`
 ```
 
+_Non-streaming endpoints_
+
+For endpoints that don't support streaming like o1 on Azure, you can pass `streamingSupported: false` in your endpoint config:
+
+```
+MODELS=`[{
+  "id": "o1-preview",
+  "name": "o1-preview",
+  "displayName": "o1-preview",
+  "systemRoleSupported": false,
+  "endpoints": [
+    {
+      "type": "openai",
+      "baseURL": "https://my-deployment.openai.azure.com/openai/deployments/o1-preview",
+      "defaultHeaders": {
+        "api-key": "$SECRET"
+      },
+      "streamingSupported": false,
+    }
+  ]
+}]`
+```
+
 ## Other
 
 Some other providers and their `baseURL` for reference.
 
 [Groq](https://groq.com/): https://api.groq.com/openai/v1
 [Fireworks](https://fireworks.ai/): https://api.fireworks.ai/inference/v1
+
+```
+
+```
src/lib/server/endpoints/openai/endpointOai.ts CHANGED
@@ -1,8 +1,12 @@
 import { z } from "zod";
 import { openAICompletionToTextGenerationStream } from "./openAICompletionToTextGenerationStream";
-import { openAIChatToTextGenerationStream } from "./openAIChatToTextGenerationStream";
+import {
+  openAIChatToTextGenerationSingle,
+  openAIChatToTextGenerationStream,
+} from "./openAIChatToTextGenerationStream";
 import type { CompletionCreateParamsStreaming } from "openai/resources/completions";
 import type {
+  ChatCompletionCreateParamsNonStreaming,
   ChatCompletionCreateParamsStreaming,
   ChatCompletionTool,
 } from "openai/resources/chat/completions";
@@ -113,6 +117,7 @@ export const endpointOAIParametersSchema = z.object({
     .default({}),
   /* enable use of max_completion_tokens in place of max_tokens */
   useCompletionTokens: z.boolean().default(false),
+  streamingSupported: z.boolean().default(true),
 });
 
 export async function endpointOai(
@@ -128,6 +133,7 @@ export async function endpointOai(
     multimodal,
     extraBody,
     useCompletionTokens,
+    streamingSupported,
   } = endpointOAIParametersSchema.parse(input);
 
   let OpenAI;
@@ -249,10 +255,10 @@ export async function endpointOai(
 
       const parameters = { ...model.parameters, ...generateSettings };
       const toolCallChoices = createChatCompletionToolsArray(tools);
-      const body: ChatCompletionCreateParamsStreaming = {
+      const body = {
        model: model.id ?? model.name,
        messages: messagesOpenAI,
-       stream: true,
+       stream: streamingSupported,
        ...(useCompletionTokens
          ? { max_completion_tokens: parameters?.max_new_tokens }
          : { max_tokens: parameters?.max_new_tokens }),
@@ -264,15 +270,31 @@ export async function endpointOai(
        ...(toolCallChoices.length > 0 ? { tools: toolCallChoices, tool_choice: "auto" } : {}),
       };
 
-      const openChatAICompletion = await openai.chat.completions.create(body, {
-        body: { ...body, ...extraBody },
-        headers: {
-          "ChatUI-Conversation-ID": conversationId?.toString() ?? "",
-          "X-use-cache": "false",
-        },
-      });
-
-      return openAIChatToTextGenerationStream(openChatAICompletion);
+      if (streamingSupported) {
+        const openChatAICompletion = await openai.chat.completions.create(
+          body as ChatCompletionCreateParamsStreaming,
+          {
+            body: { ...body, ...extraBody },
+            headers: {
+              "ChatUI-Conversation-ID": conversationId?.toString() ?? "",
+              "X-use-cache": "false",
+            },
+          }
+        );
+        return openAIChatToTextGenerationStream(openChatAICompletion);
+      } else {
+        const openChatAICompletion = await openai.chat.completions.create(
+          body as ChatCompletionCreateParamsNonStreaming,
+          {
+            body: { ...body, ...extraBody },
+            headers: {
+              "ChatUI-Conversation-ID": conversationId?.toString() ?? "",
+              "X-use-cache": "false",
+            },
+          }
+        );
+        return openAIChatToTextGenerationSingle(openChatAICompletion);
+      }
     };
   } else {
     throw new Error("Invalid completion type");
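The new `streamingSupported` flag defaults to `true`, so existing endpoint configs keep the streaming code path; only entries that explicitly set `"streamingSupported": false` (like the Azure o1 example above) build the request with `stream: false`. Below is a minimal sketch of how the flag flows from config into the parsed endpoint parameters. The schema here is a trimmed stand-in for `endpointOAIParametersSchema`, and the field definitions other than `streamingSupported` are assumptions for illustration:

```ts
import { z } from "zod";

// Trimmed stand-in for endpointOAIParametersSchema: only fields relevant to
// this change are shown, and their exact definitions are illustrative.
const endpointSchema = z.object({
  type: z.literal("openai"),
  baseURL: z.string().default("https://api.openai.com/v1"),
  useCompletionTokens: z.boolean().default(false),
  streamingSupported: z.boolean().default(true),
});

// An Azure o1 endpoint entry as it would appear in the MODELS config.
const parsed = endpointSchema.parse({
  type: "openai",
  baseURL: "https://my-deployment.openai.azure.com/openai/deployments/o1-preview",
  streamingSupported: false,
});

// false -> the endpoint sends a single `stream: false` request and wraps the
// result with openAIChatToTextGenerationSingle; omitting the key leaves it true.
console.log(parsed.streamingSupported);
```

Because the default is `true`, configs that never mention the key are unaffected and keep going through `openAIChatToTextGenerationStream`.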
src/lib/server/endpoints/openai/openAIChatToTextGenerationStream.ts CHANGED
@@ -94,3 +94,25 @@ export async function* openAIChatToTextGenerationStream(
     }
   }
 }
+
+/**
+ * Transform a non-streaming OpenAI chat completion into a stream of TextGenerationStreamOutput
+ */
+export async function* openAIChatToTextGenerationSingle(
+  completion: OpenAI.Chat.Completions.ChatCompletion
+) {
+  const content = completion.choices[0]?.message?.content || "";
+  const tokenId = 0;
+
+  // Yield the content as a single token
+  yield {
+    token: {
+      id: tokenId,
+      text: content,
+      logprob: 0,
+      special: false,
+    },
+    generated_text: content,
+    details: null,
+  } as TextGenerationStreamOutput;
+}
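Both branches of the chat-completions endpoint return an async generator, so the caller's `for await` loop works the same whether the provider streamed or not; the non-streaming case simply yields once with the whole answer. A rough consumption sketch of the new helper, using a hand-built `ChatCompletion` object in place of a real response from `openai.chat.completions.create`:

```ts
import type OpenAI from "openai";
import { openAIChatToTextGenerationSingle } from "./openAIChatToTextGenerationStream";

// Hand-built, minimal non-streaming response standing in for what an Azure
// o1 deployment would return with `stream: false`.
const completion = {
  id: "chatcmpl-example",
  object: "chat.completion",
  created: 0,
  model: "o1-preview",
  choices: [
    {
      index: 0,
      message: { role: "assistant", content: "Hello from o1", refusal: null },
      finish_reason: "stop",
      logprobs: null,
    },
  ],
} as OpenAI.Chat.Completions.ChatCompletion;

async function main() {
  // The generator yields exactly one output carrying the full text, so the
  // same loop used for streaming endpoints keeps working unchanged.
  for await (const output of openAIChatToTextGenerationSingle(completion)) {
    console.log(output.token.text); // "Hello from o1"
    console.log(output.generated_text); // "Hello from o1"
  }
}

main();
```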