chapter-llama / src /test /utils_chapters.py
lucas-ventura's picture
Upload utils_chapters.py
0ca274b verified
import re
def extract_chapters(output: str | list[str]):
"""
Extract chapters from the given output string or list of strings.
Args:
output (str | list[str]): The input text containing chapter information.
vid_duration (str | None): The video duration in hh:mm:ss format. Default is None.
Returns:
dict: A dictionary of extracted chapters with timestamps as keys and titles as values.
"""
# Only capture the first timestamp (hh:mm:ss) and ignore the second.
pattern = r"(\d{2}:[0-5]\d:[0-5]\d)\b"
chapters = {}
if isinstance(output, str):
output = output.split("\n")
for line in output:
if len(line) == 0:
continue
match = re.search(pattern, line)
if match:
time = match.group(1)
# Strip any additional timestamp or text following it
title = re.sub(pattern, "", line).strip()
title = title.lstrip(" -:") # Remove leading dash, colon, or space
title = title.strip()
if len(title) > 0:
chapters[time] = title
return chapters
def filter_chapters(chapters: dict, vid_duration: str | None = None):
if vid_duration:
filter_chapters = {}
for k, v in sorted(chapters.items()):
if k > vid_duration:
break
filter_chapters[k] = v
chapters = filter_chapters
# Check if chapters are in ordered by time
times = list(chapters.keys())
for i in range(1, len(times)):
if times[i] < times[i - 1]:
return {}
# remove empty chapters
chapters = {k: v for k, v in chapters.items() if len(v) > 0}
# if only one chapter at 00:00:00, return empty dict
if len(chapters) == 1 and list(chapters.keys())[0] == "00:00:00":
return {}
return chapters
if __name__ == "__main__":
    # Example usage: parse a hand-written chapter listing, then filter it
    # against a video duration of 00:40:00 and print what remains.
    sample_output = """
00:00:00 Introduction - good
00:05:30 - 00:05:33: Second Chapter
00:05:33: Another Chapter
00:90:00 - Wrong time
00:42:00 - After video duration
00:39:00 - What is this?
01:04:00 - Outside of video duration
"""
    parsed = extract_chapters(sample_output)
    kept = filter_chapters(parsed, vid_duration="00:40:00")
    for time, title in kept.items():
        print(f"Time: {time}, Title: {title}")