Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import type { SerializedHTMLElement } from "../../scrape/types"; | |
import { MarkdownElementType, type MarkdownElement } from "../types"; | |
// --- Markdown Elements --- | |
/** Converts markdown element to a string with formatting */ | |
export function stringifyMarkdownElement(elem: MarkdownElement): string { | |
const content = elem.content.trim(); | |
if (elem.type === MarkdownElementType.Header) return `${"#".repeat(elem.level)} ${content}\n\n`; | |
if (elem.type === MarkdownElementType.BlockQuote) { | |
return `${"> ".repeat(elem.depth)}${content}\n\n`; | |
} | |
if (elem.type === MarkdownElementType.CodeBlock) return `\`\`\`\n${content}\n\`\`\`\n\n`; | |
if (elem.type === MarkdownElementType.UnorderedListItem) return `- ${content}\n`; | |
if (elem.type === MarkdownElementType.OrderedListItem) { | |
const siblings = elem.parent?.children ?? [elem]; | |
const currentIndex = siblings.indexOf(elem); | |
const lastAdjacentIndex = siblings | |
.slice(currentIndex + 1) | |
.findLastIndex((child) => child.type === MarkdownElementType.OrderedListItem); | |
const order = currentIndex - lastAdjacentIndex + 1; | |
return `${order}. ${content}\n`; | |
} | |
return `${content}\n\n`; | |
} | |
/** Converts a tree of markdown elements to a string with formatting */ | |
export function stringifyMarkdownElementTree(elem: MarkdownElement): string { | |
const stringified = stringifyMarkdownElement(elem); | |
if (!("children" in elem)) return stringified; | |
return stringified + elem.children.map(stringifyMarkdownElementTree).join(""); | |
} | |
// ----- HTML Elements ----- | |
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */ | |
export function stringifyHTMLElements(elems: (SerializedHTMLElement | string)[]): string { | |
return elems.map(stringifyHTMLElement).join("").trim(); | |
} | |
/** Ignores all non-inline tag types and grabs their text. Converts inline tags to markdown */ | |
export function stringifyHTMLElement(elem: SerializedHTMLElement | string): string { | |
if (typeof elem === "string") return elem; | |
if (elem.tagName === "br") return "\n"; | |
const content = elem.content.map(stringifyHTMLElement).join(""); | |
if (content.length === 0) return content; | |
if (elem.tagName === "strong" || elem.tagName === "b") return `**${content}**`; | |
if (elem.tagName === "em" || elem.tagName === "i") return `*${content}*`; | |
if (elem.tagName === "s" || elem.tagName === "strike") return `~~${content}~~`; | |
if (elem.tagName === "code" || elem.tagName === "var" || elem.tagName === "tt") { | |
return `\`${content}\``; | |
} | |
if (elem.tagName === "sup") return `<sup>${content}</sup>`; | |
if (elem.tagName === "sub") return `<sub>${content}</sub>`; | |
if (elem.tagName === "a" && content.trim().length > 0) { | |
const href = elem.attributes.href; | |
if (!href) return elem.content.map(stringifyHTMLElement).join(""); | |
return `[${elem.content.map(stringifyHTMLElement).join("")}](${href})`; | |
} | |
return elem.content.map(stringifyHTMLElement).join(""); | |
} | |
/** Grabs all text content directly, ignoring HTML tags */ | |
export function stringifyHTMLElementsUnformatted( | |
elems: (SerializedHTMLElement | string)[] | |
): string { | |
return elems.map(stringifyHTMLElementUnformatted).join(""); | |
} | |
/** Grabs all text content directly, ignoring HTML tags */ | |
function stringifyHTMLElementUnformatted(elem: SerializedHTMLElement | string): string { | |
if (typeof elem === "string") return elem; | |
return elem.content.map(stringifyHTMLElementUnformatted).join(""); | |
} | |