Spaces:

jdelavande
/

chat-ui-energy

Running on CPU Upgrade

File size: 16,884 Bytes

import type { SerializedHTMLElement } from "./types";

interface DBSCANOptions<T> {
	dataset: T[];
	epsilon?: number;
	epsilonCompare?: (distance: number, epsilon: number) => boolean;
	minimumPoints?: number;
	distanceFunction: (a: T, b: T) => number;
}

export function spatialParser() {
	/**
	 * Implementation for dbscan, inlined and migrated to typescript from https://github.com/cdxOo/dbscan (MIT License)
	 */
	const DBSCAN = <T>({
		dataset,
		epsilon = 1,
		epsilonCompare = (dist, e) => dist < e,
		minimumPoints = 2,
		distanceFunction,
	}: DBSCANOptions<T>) => {
		const visitedIndices: Record<number, boolean> = {};
		const isVisited = (i: number) => visitedIndices[i];
		const markVisited = (i: number) => {
			visitedIndices[i] = true;
		};

		const clusteredIndices: Record<number, boolean> = {};
		const isClustered = (i: number) => clusteredIndices[i];
		const markClustered = (i: number) => {
			clusteredIndices[i] = true;
		};

		const uniqueMerge = <U>(targetArray: U[], sourceArray: U[]) => {
			for (let i = 0; i < sourceArray.length; i += 1) {
				const item = sourceArray[i];
				if (targetArray.indexOf(item) < 0) {
					targetArray.push(item);
				}
			}
		};

		const findNeighbors = (index: number) => {
			const neighbors = [];
			for (let other = 0; other < dataset.length; other += 1) {
				const distance = distanceFunction(dataset[index], dataset[other]);
				if (epsilonCompare(distance, epsilon)) {
					neighbors.push(other);
				}
			}
			return neighbors;
		};

		const noise: number[] = [];
		const addNoise = (i: number) => noise.push(i);

		const clusters: number[][] = [];
		const createCluster = () => clusters.push([]) - 1;
		const addIndexToCluster = (c: number, i: number) => {
			clusters[c].push(i);
			markClustered(i);
		};

		const expandCluster = (c: number, neighbors: number[]) => {
			for (let i = 0; i < neighbors.length; i += 1) {
				const neighborIndex = neighbors[i];
				if (!isVisited(neighborIndex)) {
					markVisited(neighborIndex);

					const secondaryNeighbors = findNeighbors(neighborIndex);
					if (secondaryNeighbors.length >= minimumPoints) {
						uniqueMerge(neighbors, secondaryNeighbors);
					}
				}

				if (!isClustered(neighborIndex)) {
					addIndexToCluster(c, neighborIndex);
				}
			}
		};

		dataset.forEach((_, index) => {
			if (!isVisited(index)) {
				markVisited(index);

				const neighbors = findNeighbors(index);
				if (neighbors.length < minimumPoints) {
					addNoise(index);
				} else {
					const clusterIndex = createCluster();
					addIndexToCluster(clusterIndex, index);
					expandCluster(clusterIndex, neighbors);
				}
			}
		});

		return { clusters, noise };
	};

	// -----------
	// Scraping implementation

	const IgnoredTagsList = [
		"footer",
		"nav",
		"aside",
		"script",
		"style",
		"noscript",
		"form",
		"button",
	];
	const InlineTags = [
		"a",
		"abbrv",
		"span",
		"address",
		"time",
		"acronym",
		"strong",
		"b",
		"br",
		"sub",
		"sup",
		"tt",
		"var",
		"em",
		"i",
	];

	type ReadableNode = HTMLElement;
	type NodeWithRect = {
		node: ReadableNode;
		rect: DOMRect;
	};

	const isOnlyChild = (node: Node) => {
		if (!node.parentElement) return true;
		if (node.parentElement.nodeName === "body") return false;
		if (node.parentElement.childNodes.length === 1) return true;
		return false;
	};

	const hasValidInlineParent = (node: Node) => {
		return node.parentElement && !node.parentElement.matches("div, section, article, main, body ");
	};

	const hasValidParent = (node: Node) => {
		return node.parentElement && !node.parentElement.isSameNode(document.body);
	};

	const possibleCodeParents = Array.from(document.querySelectorAll("pre, p"));
	const possibleTableParents = Array.from(document.querySelectorAll("table"));
	const possibleListParents = Array.from(document.querySelectorAll("ul, ol"));
	/**
	 * We want to find the highest parent of text node in the cluster.
	 * For example in this case: <p><span>Text here</span></p>
	 * the P tag is highest parent.
	 */
	const findHighestDirectParentOfReadableNode = (node: Node): HTMLElement => {
		// go up the tree until the parent is no longer an only child
		let parent = node.parentElement;
		// if the parent is an inline tag, then go up one more level
		while (
			parent &&
			hasValidInlineParent(parent) &&
			InlineTags.includes(parent?.tagName.toLowerCase())
		) {
			parent = parent.parentElement;
		}

		while (parent && isOnlyChild(parent)) {
			if (!hasValidParent(parent)) break;
			parent = parent.parentElement;
		}

		if (!parent) {
			throw new Error(
				"disconnected node found, this should not really be possible when traversing through the dom"
			);
		}

		// if the parent is a span, code or div tag check if there is a pre tag or p tag above it
		if (["span", "code", "div"].includes(parent.nodeName.toLowerCase())) {
			const hasParent = possibleCodeParents.find((tag) => tag.contains(parent)) as HTMLElement;
			if (hasParent) {
				parent = hasParent;
			}
		}

		// if the parent is a li tag check if there is a ul or ol tag above it
		if (parent.nodeName.toLowerCase() === "li") {
			const hasParent = possibleListParents.find((tag) => tag.contains(parent)) as HTMLElement;
			if (hasParent) {
				parent = hasParent;
			}
		}

		// if the parent is a td, th, tr tag check if there is a table tag above it
		if (["td", "th", "tr"].includes(parent.nodeName.toLowerCase())) {
			const hasParent = possibleTableParents.find((tag) => tag.contains(parent)) as HTMLElement;
			if (hasParent) {
				parent = hasParent;
			}
		}

		return parent;
	};
	const barredNodes = Array.from(document.querySelectorAll(IgnoredTagsList.join(",")));

	const doesNodePassHeuristics = (node: Node) => {
		if ((node.textContent ?? "").trim().length < 10) {
			return false;
		}

		const parentNode = findHighestDirectParentOfReadableNode(node);

		if (parentNode && parentNode instanceof Element) {
			if (
				!parentNode.checkVisibility({
					checkOpacity: true,
					checkVisibilityCSS: true,
				})
			)
				return false;

			const rect = parentNode.getBoundingClientRect();
			// elements that are readable usually don't have really small height or width
			if (rect.width < 4 || rect.height < 4) {
				return false;
			}
		}

		if (parentNode && parentNode instanceof Element) {
			if (barredNodes.some((barredNode) => barredNode.contains(parentNode))) {
				return false;
			}
		}

		return true;
	};

	const getAllReadableNodes = (): NodeWithRect[] => {
		if (!document.body) throw new Error("Page failed to load");
		const treeWalker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
			acceptNode(node) {
				if (doesNodePassHeuristics(node)) {
					return NodeFilter.FILTER_ACCEPT;
				} else {
					return NodeFilter.FILTER_SKIP;
				}
			},
		});

		const readableNodes = [];

		while (treeWalker.nextNode()) {
			readableNodes.push(treeWalker.currentNode as ReadableNode);
		}

		/*
		 * <table><p>hello</p><p>world</p></table>
		 * table is already included in the parent of the first p tag
		 */

		const parentsForReadableNodes = readableNodes.map(findHighestDirectParentOfReadableNode);
		const listWithOnlyParents: HTMLElement[] = [];
		// find unique nodes in the parent list, a unique node is a node that is not a child of any other node in the list
		for (let i = 0; i < parentsForReadableNodes.length; i++) {
			const node = parentsForReadableNodes[i];
			const hasParentInList = parentsForReadableNodes.find((otherNode, idx) => {
				if (i === idx) return false;
				return otherNode.contains(node);
			});
			listWithOnlyParents.push(hasParentInList ? hasParentInList : node);
		}

		const uniqueParents = Array.from(new Set(listWithOnlyParents));

		return uniqueParents.map((node) => {
			return {
				node,
				rect: node.getBoundingClientRect(),
			};
		});
	};

	const distanceFunction = (a: NodeWithRect, b: NodeWithRect) => {
		// we make two assumptions here which are fine to make for rects returned from getBoundingClientRect
		// 1. rects are upright and not rotated
		// 2. If two rects intersect, we assume distance to be 0
		let dx = 0;
		let dy = 0;
		const rect1 = a.rect;
		const rect2 = b.rect;
		// Calculate the horizontal distance
		if (rect1.x + rect1.width < rect2.x) {
			dx = rect2.x - (rect1.x + rect1.width);
		} else if (rect2.x + rect2.width < rect1.x) {
			dx = rect1.x - (rect2.x + rect2.width);
		}

		// Calculate the vertical distance
		if (rect1.y + rect1.height < rect2.y) {
			dy = rect2.y - (rect1.y + rect1.height);
		} else if (rect2.y + rect2.height < rect1.y) {
			dy = rect1.y - (rect2.y + rect2.height);
		}

		const distance = Math.sqrt(dx * dx + dy * dy);
		// Return the Euclidean distance
		return distance;
	};
	/**
	 * Clusters nodes using dbscan
	 */
	const clusterReadableNodes = (nodes: NodeWithRect[]) => {
		const { clusters } = DBSCAN({
			dataset: nodes,
			epsilon: 28,
			minimumPoints: 1,
			distanceFunction,
		});

		return clusters;
	};

	const totalTextLength = (cluster: number[]) => {
		return cluster
			.map((t) => readableNodes[t].node.innerText?.replaceAll(/ {2}|\r\n|\n|\r/gm, ""))
			.join("").length;
	};

	const approximatelyEqual = (a: number, b: number, epsilon = 1) => {
		return Math.abs(a - b) < epsilon;
	};

	const getClusterBounds = (cluster: number[]) => {
		const leftMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.x));
		const topMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.y));
		const rightMostPoint = Math.max(
			...cluster.map((c) => readableNodes[c].rect.x + readableNodes[c].rect.width)
		);
		const bottomMostPoint = Math.max(
			...cluster.map((c) => readableNodes[c].rect.y + readableNodes[c].rect.height)
		);
		return {
			// left most element
			x: leftMostPoint,
			y: topMostPoint,
			width: rightMostPoint - leftMostPoint,
			height: bottomMostPoint - topMostPoint,
		};
	};

	const round = (num: number, decimalPlaces = 2) => {
		const factor = Math.pow(10, decimalPlaces);
		return Math.round(num * factor) / factor;
	};

	/** minimum distance to center of the screen */
	const clusterCentrality = (cluster: number[]) => {
		const bounds = getClusterBounds(cluster);
		const centerOfScreen = window.innerWidth / 2;
		// the cluster contains the center of the screen
		if (bounds.x < centerOfScreen && bounds.x + bounds.width > centerOfScreen) {
			return 0;
		}

		// the cluster is to the left of the screen
		if (bounds.x + bounds.width < centerOfScreen) {
			return centerOfScreen - (bounds.x + bounds.width);
		}

		// the cluster is to the right of the screen
		return bounds.x - centerOfScreen;
	};
	/** measure of text share that belong to the cluster */
	const percentageTextShare = (cluster: number[], totalLength: number) => {
		// apply an exponentially increasing penalty for centrality per 100 pixels distance from center

		return round((totalTextLength(cluster) / totalLength) * 100);
	};

	const shouldMergeClusters = (clusterA: number[], clusterB: number[]) => {
		const clusterABounds = getClusterBounds(clusterA);
		const clusterBBounds = getClusterBounds(clusterB);

		// A cluster is horizontally aligned if the x and width are roughly equal
		const isHorizontallyAligned =
			approximatelyEqual(clusterABounds.x, clusterBBounds.x, 40) &&
			approximatelyEqual(clusterABounds.width, clusterBBounds.width, 40);

		if (!isHorizontallyAligned) return false;

		// check the y gap between the clusters
		const higherCluster = clusterABounds.y < clusterBBounds.y ? clusterABounds : clusterBBounds;
		const lowerCluster = clusterABounds.y < clusterBBounds.y ? clusterBBounds : clusterABounds;
		const yGap = lowerCluster.y - (higherCluster.y + higherCluster.height);

		if (approximatelyEqual(yGap, 0, 100)) return true;
	};

	const findCriticalClusters = (clusters: number[][]) => {
		// merge the clusters that have similar widths and x position

		let i = 0;
		while (i < clusters.length) {
			const cluster = clusters[i];
			for (let j = i + 1; j < clusters.length; j++) {
				const otherCluster = clusters[j];
				if (shouldMergeClusters(cluster, otherCluster)) {
					cluster.push(...otherCluster);
					clusters.splice(j, 1);
					j -= 1;
				}
			}

			i++;
		}

		const totalText = totalTextLength(clusters.flat());

		// sort in descending order of text share
		const clusterWithMetrics = clusters.map((cluster) => {
			const centrality = clusterCentrality(cluster);
			return {
				cluster,
				centrality,
				percentageTextShare: percentageTextShare(cluster, totalText),
			};
		});

		// if there is a dominant cluster with more than 60% text share, return that
		const dominantCluster = clusterWithMetrics[0]?.percentageTextShare > 60;
		if (dominantCluster) return [clusterWithMetrics[0].cluster];

		// clusters are sorted by text share after applying a penalty for centrality
		const sortedClusters = clusterWithMetrics.sort((a, b) => {
			const penaltyForA = Math.pow(0.9, a.centrality / 100);
			const penaltyForB = Math.pow(0.9, b.centrality / 100);
			const adjustedTextShareA = a.percentageTextShare * penaltyForA;
			const adjustedTextShareB = b.percentageTextShare * penaltyForB;

			return adjustedTextShareB - adjustedTextShareA;
		});

		// find all clusters that are similar to the largest cluster in terms of text share
		// and see if they are enough to cover at least 60% of the text share
		const largeTextShareClusters = sortedClusters.filter((c) =>
			approximatelyEqual(c.percentageTextShare, sortedClusters[0]?.percentageTextShare, 10)
		);

		const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
			(acc, cluster) => acc + cluster.percentageTextShare,
			0
		);

		if (totalTextShareOfLargeClusters > 60) {
			return largeTextShareClusters.map((c) => c.cluster);
		}

		// choose clusters till the text share is greater than 60%
		let totalTextShare = 0;
		const criticalClusters = [];
		for (const cluster of sortedClusters) {
			/** Ignore clusters with less than 2%*/
			if (cluster.percentageTextShare < 2) continue;
			if (totalTextShare > 60) break;
			criticalClusters.push(cluster.cluster);
			totalTextShare += cluster.percentageTextShare;
		}

		// if the total text share is less than 60% then return an empty array
		// as this website should not be particularly useful for the web search anyways
		// this should almost never happen on structured website with a lot of text
		if (totalTextShare < 60) {
			return [];
		}

		return criticalClusters;
	};

	const allowListedAttributes = ["href", "src", "alt", "title", "class", "id"];
	function serializeHTMLElement(node: Element): SerializedHTMLElement {
		return {
			tagName: node.tagName.toLowerCase(),
			attributes: allowListedAttributes.reduce(
				(acc, attr) => {
					const value = node.getAttribute(attr);
					if (value) {
						acc[attr] = value;
					}
					return acc;
				},
				{} as Record<string, string>
			),
			content: Array.from(node.childNodes).map(serializeNode).filter(Boolean),
		};
	}

	function serializeNode(node: Node): SerializedHTMLElement | string {
		if (node.nodeType === 1) return serializeHTMLElement(node as Element);
		else if (node.nodeType === 3) return node.textContent ?? "";
		else return "";
	}

	function getPageMetadata(): {
		title: string;
		siteName?: string;
		author?: string;
		description?: string;
		createdAt?: string;
		updatedAt?: string;
	} {
		const title = document.title ?? "";
		const siteName =
			document.querySelector("meta[property='og:site_name']")?.getAttribute("content") ?? undefined;
		const author =
			document.querySelector("meta[name='author']")?.getAttribute("content") ?? undefined;
		const description =
			document.querySelector("meta[name='description']")?.getAttribute("content") ??
			document.querySelector("meta[property='og:description']")?.getAttribute("content") ??
			undefined;
		const createdAt =
			document.querySelector("meta[property='article:published_time']")?.getAttribute("content") ??
			document.querySelector("meta[name='date']")?.getAttribute("content") ??
			undefined;
		const updatedAt =
			document.querySelector("meta[property='article:modified_time']")?.getAttribute("content") ??
			undefined;

		return { title, siteName, author, description, createdAt, updatedAt };
	}

	const readableNodes = getAllReadableNodes();
	const clusters = clusterReadableNodes(readableNodes);

	const criticalClusters = findCriticalClusters(clusters);

	// filter readable nodes using the above information as well as heuristics
	const filteredNodes = readableNodes.filter((_, idx) => {
		return criticalClusters.some((cluster) => {
			return cluster.includes(idx);
		});
	});

	const elements = filteredNodes
		.filter(
			(node, idx, nodes) => !nodes.slice(idx + 1).some((otherNode) => node.node === otherNode.node)
		)
		.map<SerializedHTMLElement>(({ node }) => serializeHTMLElement(node));
	const metadata = getPageMetadata();
	return { ...metadata, elements };
}