feat(openai): added support for non-streaming o1 (e.g. Azure) models (#1687)

* feat(openai): added support for non-streaming o1 (e.g. Azure) models
* feat(docs): add non streaming example
* feat: moar docs

---------

Co-authored-by: Nathan Sarrazin <[email protected]>
README.md
CHANGED

````diff
@@ -482,6 +482,29 @@ MODELS=`[{
 }]`
 ```
 
+_Non-streaming endpoints_
+
+For endpoints that don't support streaming, like o1 on Azure, you can pass `streamingSupported: false` in your endpoint config:
+
+```
+MODELS=`[{
+  "id": "o1-preview",
+  "name": "o1-preview",
+  "displayName": "o1-preview",
+  "systemRoleSupported": false,
+  "endpoints": [
+    {
+      "type": "openai",
+      "baseURL": "https://my-deployment.openai.azure.com/openai/deployments/o1-preview",
+      "defaultHeaders": {
+        "api-key": "$SECRET"
+      },
+      "streamingSupported": false,
+    }
+  ]
+}]`
+```
+
 ##### Llama.cpp API server
 
 chat-ui also supports the llama.cpp API server directly without the need for an adapter. You can do this using the `llamacpp` endpoint type.
````
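To make the new option concrete, here is a minimal sketch of what a non-streaming request to such an Azure o1 deployment looks like with the `openai` Node SDK. The base URL and `api-key` header mirror the example config above; the `SECRET` environment variable and the `api-version` value are illustrative assumptions, not values chat-ui ships with.

```ts
import OpenAI from "openai";

// Sketch only: baseURL and the api-key header mirror the README example above.
// The SECRET env var and the api-version value are assumptions for illustration.
const openai = new OpenAI({
	apiKey: "unused", // Azure authenticates through the api-key header instead
	baseURL: "https://my-deployment.openai.azure.com/openai/deployments/o1-preview",
	defaultHeaders: { "api-key": process.env.SECRET ?? "" },
	defaultQuery: { "api-version": "2024-08-01-preview" },
});

// A deployment that doesn't support streaming returns one complete answer:
// no `stream: true`, no chunk iteration, just a single ChatCompletion object.
const completion = await openai.chat.completions.create({
	model: "o1-preview",
	messages: [{ role: "user", content: "Hello!" }],
});

console.log(completion.choices[0]?.message?.content);
```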
docs/source/configuration/models/providers/openai.md
CHANGED

````diff
@@ -146,9 +146,36 @@ MODELS=`[{
 }]`
 ```
 
+_Non-streaming endpoints_
+
+For endpoints that don't support streaming, like o1 on Azure, you can pass `streamingSupported: false` in your endpoint config:
+
+```
+MODELS=`[{
+  "id": "o1-preview",
+  "name": "o1-preview",
+  "displayName": "o1-preview",
+  "systemRoleSupported": false,
+  "endpoints": [
+    {
+      "type": "openai",
+      "baseURL": "https://my-deployment.openai.azure.com/openai/deployments/o1-preview",
+      "defaultHeaders": {
+        "api-key": "$SECRET"
+      },
+      "streamingSupported": false,
+    }
+  ]
+}]`
+```
+
 ## Other
 
 Some other providers and their `baseURL` for reference.
 
 [Groq](https://groq.com/): https://api.groq.com/openai/v1
 [Fireworks](https://fireworks.ai/): https://api.fireworks.ai/inference/v1
+
+```
+
+```
````
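The example config also sets `systemRoleSupported: false`, since o1-preview does not accept a `system` message. A minimal sketch of the general technique such a flag implies, folding the system prompt into the first user message instead (illustrative only, not chat-ui's actual implementation):

```ts
// Illustrative only: a simplified message shape, not the SDK's full union type.
type SimpleMessage = { role: "system" | "user" | "assistant"; content: string };

// When the model has no system role, prepend the prompt to the first user
// message instead of emitting a separate system message.
function applySystemPrompt(
	messages: SimpleMessage[],
	systemPrompt: string,
	systemRoleSupported: boolean
): SimpleMessage[] {
	if (systemRoleSupported) {
		return [{ role: "system", content: systemPrompt }, ...messages];
	}
	return messages.map((message, i) =>
		i === 0 && message.role === "user"
			? { ...message, content: `${systemPrompt}\n\n${message.content}` }
			: message
	);
}
```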
src/lib/server/endpoints/openai/endpointOai.ts
CHANGED

````diff
@@ -1,8 +1,12 @@
 import { z } from "zod";
 import { openAICompletionToTextGenerationStream } from "./openAICompletionToTextGenerationStream";
-import { openAIChatToTextGenerationStream } from "./openAIChatToTextGenerationStream";
+import {
+	openAIChatToTextGenerationSingle,
+	openAIChatToTextGenerationStream,
+} from "./openAIChatToTextGenerationStream";
 import type { CompletionCreateParamsStreaming } from "openai/resources/completions";
 import type {
+	ChatCompletionCreateParamsNonStreaming,
 	ChatCompletionCreateParamsStreaming,
 	ChatCompletionTool,
 } from "openai/resources/chat/completions";
@@ -113,6 +117,7 @@ export const endpointOAIParametersSchema = z.object({
 		.default({}),
 	/* enable use of max_completion_tokens in place of max_tokens */
 	useCompletionTokens: z.boolean().default(false),
+	streamingSupported: z.boolean().default(true),
 });
 
 export async function endpointOai(
@@ -128,6 +133,7 @@ export async function endpointOai(
 		multimodal,
 		extraBody,
 		useCompletionTokens,
+		streamingSupported,
 	} = endpointOAIParametersSchema.parse(input);
 
 	let OpenAI;
@@ -249,10 +255,10 @@ export async function endpointOai(
 
 		const parameters = { ...model.parameters, ...generateSettings };
 		const toolCallChoices = createChatCompletionToolsArray(tools);
-		const body: ChatCompletionCreateParamsStreaming = {
+		const body = {
 			model: model.id ?? model.name,
 			messages: messagesOpenAI,
-			stream: true,
+			stream: streamingSupported,
 			...(useCompletionTokens
 				? { max_completion_tokens: parameters?.max_new_tokens }
 				: { max_tokens: parameters?.max_new_tokens }),
@@ -264,15 +270,31 @@ export async function endpointOai(
 			...(toolCallChoices.length > 0 ? { tools: toolCallChoices, tool_choice: "auto" } : {}),
 		};
 
-		const openChatAICompletion = await openai.chat.completions.create(body, {
-			body: { ...body, ...extraBody },
-			headers: {
-				"ChatUI-Conversation-ID": conversationId?.toString() ?? "",
-				"X-use-cache": "false",
-			},
-		});
-
-		return openAIChatToTextGenerationStream(openChatAICompletion);
+		if (streamingSupported) {
+			const openChatAICompletion = await openai.chat.completions.create(
+				body as ChatCompletionCreateParamsStreaming,
+				{
+					body: { ...body, ...extraBody },
+					headers: {
+						"ChatUI-Conversation-ID": conversationId?.toString() ?? "",
+						"X-use-cache": "false",
+					},
+				}
+			);
+			return openAIChatToTextGenerationStream(openChatAICompletion);
+		} else {
+			const openChatAICompletion = await openai.chat.completions.create(
+				body as ChatCompletionCreateParamsNonStreaming,
+				{
+					body: { ...body, ...extraBody },
+					headers: {
+						"ChatUI-Conversation-ID": conversationId?.toString() ?? "",
+						"X-use-cache": "false",
+					},
+				}
+			);
+			return openAIChatToTextGenerationSingle(openChatAICompletion);
+		}
 	};
 } else {
 	throw new Error("Invalid completion type");
````
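The key point in the diff above is that a single request `body` serves both paths, with `stream` driven by the new `streamingSupported` flag; the `as` casts are needed because the SDK picks its return type (an async iterable of chunks vs. a plain `ChatCompletion`) from the literal type of `stream`. A condensed sketch of that pattern outside chat-ui's surrounding code (the model id and messages are placeholders):

```ts
import OpenAI from "openai";
import type {
	ChatCompletionCreateParamsNonStreaming,
	ChatCompletionCreateParamsStreaming,
	ChatCompletionMessageParam,
} from "openai/resources/chat/completions";

const openai = new OpenAI();

// Placeholder conversation; in chat-ui this comes from the current chat.
const messages: ChatCompletionMessageParam[] = [{ role: "user", content: "Hi" }];

async function complete(streamingSupported: boolean): Promise<string> {
	// One body for both paths; only the stream flag differs.
	const body = {
		model: "gpt-4o-mini", // placeholder model id
		messages,
		stream: streamingSupported,
	};

	if (streamingSupported) {
		// stream: true -> the SDK returns an async iterable of chunks.
		const stream = await openai.chat.completions.create(
			body as ChatCompletionCreateParamsStreaming
		);
		let text = "";
		for await (const chunk of stream) {
			text += chunk.choices[0]?.delta?.content ?? "";
		}
		return text;
	}

	// stream: false -> the SDK returns the whole completion at once.
	const completion = await openai.chat.completions.create(
		body as ChatCompletionCreateParamsNonStreaming
	);
	return completion.choices[0]?.message?.content ?? "";
}

console.log(await complete(false));
```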
src/lib/server/endpoints/openai/openAIChatToTextGenerationStream.ts
CHANGED

````diff
@@ -94,3 +94,25 @@ export async function* openAIChatToTextGenerationStream(
 		}
 	}
 }
+
+/**
+ * Transform a non-streaming OpenAI chat completion into a stream of TextGenerationStreamOutput
+ */
+export async function* openAIChatToTextGenerationSingle(
+	completion: OpenAI.Chat.Completions.ChatCompletion
+) {
+	const content = completion.choices[0]?.message?.content || "";
+	const tokenId = 0;
+
+	// Yield the content as a single token
+	yield {
+		token: {
+			id: tokenId,
+			text: content,
+			logprob: 0,
+			special: false,
+		},
+		generated_text: content,
+		details: null,
+	} as TextGenerationStreamOutput;
+}
````
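Because `openAIChatToTextGenerationSingle` is still an async generator, callers consume a non-streaming model exactly like a streaming one; the only difference is that the loop body runs once, with the full answer in both `token.text` and `generated_text`. A hypothetical consumer sketch (the model id is a placeholder, and the import path assumes this file's location):

```ts
import OpenAI from "openai";
import { openAIChatToTextGenerationSingle } from "./openAIChatToTextGenerationStream";

const openai = new OpenAI();

// Fetch one complete answer (no streaming), then wrap it so the caller can
// iterate it exactly like a real token stream.
const completion = await openai.chat.completions.create({
	model: "o1-preview", // placeholder model id
	messages: [{ role: "user", content: "Hello!" }],
});

for await (const output of openAIChatToTextGenerationSingle(completion)) {
	// Runs exactly once: the whole answer arrives as a single token-shaped chunk.
	console.log(output.token.text);
	console.log(output.generated_text);
}
```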