File size: 3,468 Bytes
b17a5c8
9264459
 
 
2c00ea8
9264459
564e576
9264459
2c00ea8
 
 
 
564e576
 
 
 
 
 
 
6e18e46
ee5c213
e5f4e9a
2c00ea8
 
e5f4e9a
564e576
e5f4e9a
2a808d7
564e576
 
 
2a808d7
2c00ea8
 
e5f4e9a
6e18e46
 
e5f4e9a
b17a5c8
 
 
2c00ea8
b17a5c8
 
2c00ea8
564e576
2c00ea8
 
 
564e576
2c00ea8
564e576
 
2c00ea8
564e576
 
 
 
2c00ea8
 
 
e5f4e9a
 
2c00ea8
564e576
2c00ea8
 
 
 
 
 
 
564e576
2c00ea8
564e576
2c00ea8
 
 
 
 
 
 
 
 
 
5459f31
564e576
e5f4e9a
2c00ea8
ee5c213
564e576
 
 
2c00ea8
 
 
 
 
 
 
5459f31
564e576
e5f4e9a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";

import type { Conversation } from "$lib/types/Conversation";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchScrapedSource } from "$lib/types/WebSearch";
import type { Assistant } from "$lib/types/Assistant";
import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";

import { search } from "./search/search";
import { scrape } from "./scrape/scrape";
import { findContextSources } from "./embed/embed";
import { removeParents } from "./markdown/tree";
import {
	makeErrorUpdate,
	makeFinalAnswerUpdate,
	makeGeneralUpdate,
	makeSourcesUpdate,
} from "./update";
import { mergeAsyncGenerators } from "$lib/utils/mergeAsyncGenerators";
import { MetricsServer } from "../metrics";
import { logger } from "$lib/server/logger";

/** Cap on how many search result pages are handed to the scraper. */
const MAX_N_PAGES_TO_SCRAPE = 8;
/** Cap on how many scraped pages (with non-empty content) are kept for embedding. */
const MAX_N_PAGES_TO_EMBED = 5;

/**
 * Runs the web-search pipeline for the latest message of a conversation:
 * search, scrape, select context sources, and assemble the `WebSearch` record.
 *
 * Progress is reported by yielding `MessageWebSearchUpdate`s along the way. Any
 * error thrown inside the pipeline is logged and surfaced as an error update;
 * the generator then returns a `WebSearch` with an empty search query, no results
 * and no context sources instead of throwing.
 *
 * Note that the last message is read (and the request metric incremented) before
 * the error handling starts, so an empty `messages` array makes the generator throw.
 *
 * @param conv - Conversation whose `embeddingModel` id selects the embedding model.
 * @param messages - Conversation messages; the content of the last one is the prompt.
 * @param ragSettings - Optional assistant RAG settings, forwarded to `search`.
 * @param query - Optional query, forwarded to `search`.
 * @returns The assembled `WebSearch`, or the empty fallback after a failure.
 */
export async function* runWebSearch(
	conv: Conversation,
	messages: Message[],
	ragSettings?: Assistant["rag"],
	query?: string
): AsyncGenerator<MessageWebSearchUpdate, WebSearch, undefined> {
	const lastMessage = messages[messages.length - 1];
	const prompt = lastMessage.content;
	const createdAt = new Date();
	const updatedAt = new Date();

	MetricsServer.getMetrics().webSearch.requestCount.inc();

	try {
		// Pick the conversation's embedding model, falling back to the default one
		const requestedModel = embeddingModels.find((model) => model.id === conv.embeddingModel);
		const embeddingModel = requestedModel ?? defaultEmbeddingModel;
		if (!embeddingModel) {
			throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		// Search the web; `search` yields its own updates, which are forwarded as-is
		const searchOutcome = yield* search(messages, ragSettings, query);
		const { searchQuery, pages } = searchOutcome;
		if (pages.length === 0) {
			throw new Error("No results found for this search query");
		}

		// Scrape only the leading search results
		yield makeGeneralUpdate({ message: "Browsing search results" });

		const pagesToScrape = pages.slice(0, MAX_N_PAGES_TO_SCRAPE);
		const scrapePage = scrape(embeddingModel.chunkCharLength);
		const allScrapedPages = yield* mergeAsyncGenerators(pagesToScrape.map(scrapePage));

		// Drop falsy scrape results, then pages whose markdown tree has no children,
		// and keep at most MAX_N_PAGES_TO_EMBED of what is left
		const successfulScrapes = allScrapedPages.filter(
			(result): result is WebSearchScrapedSource => Boolean(result)
		);
		const pagesWithContent = successfulScrapes.filter(
			({ page }) => page.markdownTree.children.length > 0
		);
		const scrapedPages = pagesWithContent.slice(0, MAX_N_PAGES_TO_EMBED);

		if (scrapedPages.length === 0) {
			throw new Error(`No text found in the first ${MAX_N_PAGES_TO_SCRAPE} results`);
		}

		// Let the embedding step select context sources for the prompt
		yield makeGeneralUpdate({ message: "Extracting relevant information" });
		const foundSources = await findContextSources(scrapedPages, prompt, embeddingModel);
		// NOTE(review): removeParents presumably strips parent back-references from the
		// markdown tree so it can be stored/serialized — confirm in ./markdown/tree
		const contextSources = foundSources.map((source) => ({
			...source,
			page: { ...source.page, markdownTree: removeParents(source.page.markdownTree) },
		}));
		yield makeSourcesUpdate(contextSources);

		// Same parent-stripping for every scraped page kept in the final record
		const results = scrapedPages.map(({ page, ...rest }) => ({
			...rest,
			page: { ...page, markdownTree: removeParents(page.markdownTree) },
		}));

		yield makeFinalAnswerUpdate();
		return { prompt, searchQuery, results, contextSources, createdAt, updatedAt };
	} catch (err) {
		let message: string;
		if (err instanceof Error) {
			message = err.message;
		} else {
			message = String(err);
		}
		logger.error(message);
		yield makeErrorUpdate({ message: "An error occurred", args: [message] });

		// The final-answer update is still emitted after a failure, followed by an empty record
		yield makeFinalAnswerUpdate();
		return { prompt, searchQuery: "", results: [], contextSources: [], createdAt, updatedAt };
	}
}