import type { WebSearchScrapedSource, WebSearchSource } from "$lib/types/WebSearch";
import type { MessageWebSearchUpdate } from "$lib/types/MessageUpdate";
import { withPage } from "./playwright";
import { spatialParser } from "./parser";
import { htmlToMarkdownTree } from "../markdown/tree";
import { timeout } from "$lib/utils/timeout";
import { makeGeneralUpdate } from "../update";
import { MetricsServer } from "$lib/server/metrics";
import { logger } from "$lib/server/logger";
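
/**
 * Builds an async generator that scrapes one search result. It records fetch metrics,
 * loads and parses the page via scrapeUrl, yields a "Browsing webpage" update for the
 * chat UI, and returns the source enriched with the scraped page. On failure it
 * increments the error counter, logs the error, and returns undefined.
 */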
export const scrape = (maxCharsPerElem: number) =>
	async function* (
		source: WebSearchSource
	): AsyncGenerator<MessageWebSearchUpdate, WebSearchScrapedSource | undefined, undefined> {
		try {
			const startTime = Date.now();
			MetricsServer.getMetrics().webSearch.pageFetchCount.inc();

			const page = await scrapeUrl(source.link, maxCharsPerElem);

			MetricsServer.getMetrics().webSearch.pageFetchDuration.observe(Date.now() - startTime);

			yield makeGeneralUpdate({
				message: "Browsing webpage",
				args: [source.link],
			});

			return { ...source, page };
		} catch (e) {
			MetricsServer.getMetrics().webSearch.pageFetchCountError.inc();
			logger.error(e, `Error scraping webpage: ${source.link}`);
		}
	};
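
/**
 * Opens the URL in a Playwright page and converts it to a markdown tree.
 * Plain-text-like content types (text/plain, markdown, JSON, XML, CSV) are wrapped in a
 * single paragraph element; HTML pages are parsed with spatialParser under a 2s timeout.
 */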
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
	return withPage(url, async (page, res) => {
		if (!res) throw Error("Failed to load page");
		if (!res.ok()) throw Error(`Failed to load page: ${res.status()}`);

		// Check if it's a non-html content type that we can handle directly
		// TODO: direct mappings to markdown can be added for markdown, csv and others
		const contentType = res.headers()["content-type"] ?? "";
		if (
			contentType.includes("text/plain") ||
			contentType.includes("text/markdown") ||
			contentType.includes("application/json") ||
			contentType.includes("application/xml") ||
			contentType.includes("text/csv")
		) {
			const title = await page.title();
			const content = await page.content();
			return {
				title,
				markdownTree: htmlToMarkdownTree(
					title,
					[{ tagName: "p", attributes: {}, content: [content] }],
					maxCharsPerElem
				),
			};
		}

		const scrapedOutput = await timeout(page.evaluate(spatialParser), 2000)
			.then(({ elements, ...parsed }) => ({
				...parsed,
				markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
			}))
			.catch((cause) => {
				throw Error("Parsing failed", { cause });
			});
		return scrapedOutput;
	});
}
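
// Usage sketch (illustrative only, not part of the module): driving the generator by
// hand for a single source. `exampleSource` and the 1000-character limit are assumed
// values; in chat-ui this generator is normally consumed by the web search pipeline.
//
// const scrapeSource = scrape(1000);
// const gen = scrapeSource(exampleSource);
// let step = await gen.next();
// while (!step.done) {
// 	console.log(step.value); // MessageWebSearchUpdate, e.g. "Browsing webpage"
// 	step = await gen.next();
// }
// const scraped = step.value; // WebSearchScrapedSource, or undefined if scraping failed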