import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
import { withPage } from "./playwright";

import { spatialParser } from "./parser";
import { htmlToMarkdownTree } from "../markdown/tree";
import { timeout } from "$lib/utils/timeout";
import { makeGeneralUpdate } from "../update";
import { MetricsServer } from "$lib/server/metrics";
import { logger } from "$lib/server/logger";

/**
 * Create a generator that scrapes a single search result: the page is fetched and parsed
 * into a markdown tree, a "Browsing webpage" update is yielded, and fetch metrics are recorded.
 * On failure the error is logged and the generator returns undefined.
 */
export const scrape = (maxCharsPerElem: number) =>
	async function* (
		source: WebSearchSource
	): AsyncGenerator<MessageWebSearchUpdate, WebSearchScrapedSource | undefined, undefined> {
		try {
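			// Count the fetch attempt and time it for the web search metrics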
			const startTime = Date.now();
			MetricsServer.getMetrics().webSearch.pageFetchCount.inc();

			const page = await scrapeUrl(source.link, maxCharsPerElem);

			MetricsServer.getMetrics().webSearch.pageFetchDuration.observe(Date.now() - startTime);

			yield makeGeneralUpdate({
				message: "Browsing webpage",
				args: [source.link],
			});
			return { ...source, page };
		} catch (e) {
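			// A failed scrape is logged and counted but does not abort the search; the generator returns undefined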
			MetricsServer.getMetrics().webSearch.pageFetchCountError.inc();
			logger.error(e, `Error scraping webpage: ${source.link}`);
		}
	};

export async function scrapeUrl(url: string, maxCharsPerElem: number) {
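	// Load the url in a Playwright page and turn the response into a title and markdown tree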
	return withPage(url, async (page, res) => {
		if (!res) throw Error("Failed to load page");
		if (!res.ok()) throw Error(`Failed to load page: ${res.status()}`);

		// Check if it's a non-html content type that we can handle directly
		// TODO: direct mappings to markdown can be added for markdown, csv and others
		const contentType = res.headers()["content-type"] ?? "";
		if (
			contentType.includes("text/plain") ||
			contentType.includes("text/markdown") ||
			contentType.includes("application/json") ||
			contentType.includes("application/xml") ||
			contentType.includes("text/csv")
		) {
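			// Plain-text style responses: wrap the page content in a single paragraph element for the markdown tree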
			const title = await page.title();
			const content = await page.content();
			return {
				title,
				markdownTree: htmlToMarkdownTree(
					title,
					[{ tagName: "p", attributes: {}, content: [content] }],
					maxCharsPerElem
				),
			};
		}

		// Otherwise run the spatial parser inside the page context, with a 2 second budget, and build the markdown tree
		const scrapedOutput = await timeout(page.evaluate(spatialParser), 2000)
			.then(({ elements, ...parsed }) => ({
				...parsed,
				markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
			}))
			.catch((cause) => {
				throw Error("Parsing failed", { cause });
			});
		return scrapedOutput;
	});
}