Spaces:

lucas-ventura
/

chapter-llama

Running on Zero

App Files Files Community

chapter-llama / src /test /utils_chapters.py

lucas-ventura

Upload utils_chapters.py

0ca274b verified 28 days ago

raw

history blame contribute delete

2.36 kB

	import re


	def extract_chapters(output: str \| list[str]):
	"""
	Extract chapters from the given output string or list of strings.

	Args:
	output (str \| list[str]): The input text containing chapter information.
	vid_duration (str \| None): The video duration in hh:mm:ss format. Default is None.

	Returns:
	dict: A dictionary of extracted chapters with timestamps as keys and titles as values.
	"""

	# Only capture the first timestamp (hh:mm:ss) and ignore the second.
	pattern = r"(\d{2}:[0-5]\d:[0-5]\d)\b"
	chapters = {}

	if isinstance(output, str):
	output = output.split("\n")

	for line in output:
	if len(line) == 0:
	continue

	match = re.search(pattern, line)
	if match:
	time = match.group(1)
	# Strip any additional timestamp or text following it
	title = re.sub(pattern, "", line).strip()
	title = title.lstrip(" -:") # Remove leading dash, colon, or space
	title = title.strip()
	if len(title) > 0:
	chapters[time] = title

	return chapters


	def filter_chapters(chapters: dict, vid_duration: str \| None = None):
	if vid_duration:
	filter_chapters = {}
	for k, v in sorted(chapters.items()):
	if k > vid_duration:
	break
	filter_chapters[k] = v
	chapters = filter_chapters

	# Check if chapters are in ordered by time
	times = list(chapters.keys())
	for i in range(1, len(times)):
	if times[i] < times[i - 1]:
	return {}

	# remove empty chapters
	chapters = {k: v for k, v in chapters.items() if len(v) > 0}

	# if only one chapter at 00:00:00, return empty dict
	if len(chapters) == 1 and list(chapters.keys())[0] == "00:00:00":
	return {}

	return chapters


	if __name__ == "__main__":
	    # Demo: parse a small sample, then filter it against a video duration
	    # of 00:40:00 and print whatever chapters remain.
	    sample_output = """
	00:00:00 Introduction - good
	00:05:30 - 00:05:33: Second Chapter
	00:05:33: Another Chapter
	00:90:00 - Wrong time
	00:42:00 - After video duration
	00:39:00 - What is this?
	01:04:00 - Outside of video duration
	"""
	    kept = filter_chapters(
	        extract_chapters(sample_output),
	        vid_duration="00:40:00",
	    )
	    for time, title in kept.items():
	        print(f"Time: {time}, Title: {title}")