Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import { collapseString, sanitizeString } from "./utils/nlp"; | |
import { stringifyHTMLElements, stringifyHTMLElementsUnformatted } from "./utils/stringify"; | |
import { MarkdownElementType, tagNameMap, type HeaderElement, type MarkdownElement } from "./types"; | |
import type { SerializedHTMLElement } from "../scrape/types"; | |
/**
 * Context inherited from enclosing HTML elements while converting a subtree.
 * It decides what kind of markdown element a text node turns into and which
 * nesting depth is recorded on it.
 */
interface ConversionState {
  /** Element type assigned to text nodes found in the current context. */
  defaultType:
    | MarkdownElementType.Paragraph
    | MarkdownElementType.BlockQuote
    | MarkdownElementType.UnorderedListItem
    | MarkdownElementType.OrderedListItem;
  /** Number of enclosing list elements; copied to `depth` on list items. */
  listDepth: number;
  /** Number of enclosing block quotes; copied to `depth` on block quotes. */
  blockQuoteDepth: number;
}
/**
 * Converts one serialized HTML node into markdown elements attached to `parent`.
 *
 * - Text nodes become a single element whose type comes from the inherited
 *   state (paragraph, block quote or list item). Whitespace-only text yields
 *   an empty array.
 * - Header and code-block elements become a single element built from their
 *   stringified content.
 * - Any other element is flattened: its children are converted recursively
 *   and the results concatenated, so elements only appear where text is found.
 *
 * @param parent - Header element stored as `parent` on every created element.
 * @param elem - Serialized HTML element, or a string for a text node.
 * @param prevState - State inherited from the enclosing element. Defaults to a
 *   paragraph context with zero list and block-quote depth.
 * @returns One element for text, headers and code blocks; otherwise an array
 *   (possibly empty) of the elements produced by the children.
 */
export function htmlElementToMarkdownElements(
  parent: HeaderElement,
  elem: SerializedHTMLElement | string,
  prevState: ConversionState = {
    defaultType: MarkdownElementType.Paragraph,
    listDepth: 0,
    blockQuoteDepth: 0,
  }
): MarkdownElement | MarkdownElement[] {
  // Found text so create an element based on the previous state
  if (typeof elem === "string") {
    if (elem.trim().length === 0) return [];

    if (
      prevState.defaultType === MarkdownElementType.UnorderedListItem ||
      prevState.defaultType === MarkdownElementType.OrderedListItem
    ) {
      return {
        parent,
        type: prevState.defaultType,
        content: elem,
        depth: prevState.listDepth,
      };
    }

    if (prevState.defaultType === MarkdownElementType.BlockQuote) {
      return {
        parent,
        type: prevState.defaultType,
        content: elem,
        depth: prevState.blockQuoteDepth,
      };
    }

    return { parent, type: prevState.defaultType, content: elem };
  }

  // Tags missing from the map are treated like paragraphs, i.e. plain containers.
  const type = tagNameMap[elem.tagName] ?? MarkdownElementType.Paragraph;

  // Update the state based on the current element. Work on a copy so sibling
  // subtrees keep seeing the state of their own ancestors only.
  const state: ConversionState = { ...prevState };
  if (type === MarkdownElementType.UnorderedList || type === MarkdownElementType.OrderedList) {
    state.listDepth += 1;
    state.defaultType =
      type === MarkdownElementType.UnorderedList
        ? MarkdownElementType.UnorderedListItem
        : MarkdownElementType.OrderedListItem;
  }
  if (type === MarkdownElementType.BlockQuote) {
    state.defaultType = MarkdownElementType.BlockQuote;
    state.blockQuoteDepth += 1;
  }

  // Headers
  if (type === MarkdownElementType.Header) {
    return {
      parent,
      type,
      // The level is read from the second character of the tag name.
      // NOTE(review): assumes only h1..h6 map to Header in tagNameMap - confirm.
      level: Number(elem.tagName[1]),
      content: collapseString(stringifyHTMLElements(elem.content)),
      children: [],
    };
  }

  // Code blocks
  if (type === MarkdownElementType.CodeBlock) {
    return {
      parent,
      type,
      content: sanitizeString(stringifyHTMLElementsUnformatted(elem.content)),
    };
  }

  // Typical case, we want to flatten the DOM and only create elements when we see text
  return elem.content.flatMap((el) => htmlElementToMarkdownElements(parent, el, state));
}
/**
 * Joins runs of consecutive paragraph elements into a single paragraph whose
 * content is the concatenation of the run. Elements of any other type are
 * passed through unchanged and are never merged, even when adjacent.
 *
 * The input array and its elements are left untouched: a merged paragraph is a
 * shallow copy of the first paragraph of the run with the combined content.
 *
 * @param elements - Elements in document order.
 * @returns A new array with adjacent paragraphs merged.
 */
export function mergeAdjacentElements(elements: MarkdownElement[]): MarkdownElement[] {
  const merged: MarkdownElement[] = [];
  for (const elem of elements) {
    const last = merged[merged.length - 1];
    if (
      last &&
      last.type === MarkdownElementType.Paragraph &&
      elem.type === MarkdownElementType.Paragraph
    ) {
      // Replace rather than mutate so the caller's element objects stay intact.
      merged[merged.length - 1] = { ...last, content: last.content + elem.content };
      continue;
    }
    // push instead of spreading the accumulator keeps the whole pass linear.
    merged.push(elem);
  }
  return merged;
}