# Spaces: Running on Zero
import re
def extract_chapters(output: str | list[str]): | |
""" | |
Extract chapters from the given output string or list of strings. | |
Args: | |
output (str | list[str]): The input text containing chapter information. | |
vid_duration (str | None): The video duration in hh:mm:ss format. Default is None. | |
Returns: | |
dict: A dictionary of extracted chapters with timestamps as keys and titles as values. | |
""" | |
# Only capture the first timestamp (hh:mm:ss) and ignore the second. | |
pattern = r"(\d{2}:[0-5]\d:[0-5]\d)\b" | |
chapters = {} | |
if isinstance(output, str): | |
output = output.split("\n") | |
for line in output: | |
if len(line) == 0: | |
continue | |
match = re.search(pattern, line) | |
if match: | |
time = match.group(1) | |
# Strip any additional timestamp or text following it | |
title = re.sub(pattern, "", line).strip() | |
title = title.lstrip(" -:") # Remove leading dash, colon, or space | |
title = title.strip() | |
if len(title) > 0: | |
chapters[time] = title | |
return chapters | |
def filter_chapters(chapters: dict, vid_duration: str | None = None): | |
if vid_duration: | |
filter_chapters = {} | |
for k, v in sorted(chapters.items()): | |
if k > vid_duration: | |
break | |
filter_chapters[k] = v | |
chapters = filter_chapters | |
# Check if chapters are in ordered by time | |
times = list(chapters.keys()) | |
for i in range(1, len(times)): | |
if times[i] < times[i - 1]: | |
return {} | |
# remove empty chapters | |
chapters = {k: v for k, v in chapters.items() if len(v) > 0} | |
# if only one chapter at 00:00:00, return empty dict | |
if len(chapters) == 1 and list(chapters.keys())[0] == "00:00:00": | |
return {} | |
return chapters | |
if __name__ == "__main__":
    # Example usage: parse a small hand-written chapter list, then filter it
    # against a 40-minute video duration and print what is left.
    # Leading whitespace on the sample lines is harmless: titles are stripped.
    text = """
    00:00:00 Introduction - good
    00:05:30 - 00:05:33: Second Chapter
    00:05:33: Another Chapter
    00:90:00 - Wrong time
    00:42:00 - After video duration
    00:39:00 - What is this?
    01:04:00 - Outside of video duration
    """
    chapters = extract_chapters(text)
    chapters = filter_chapters(chapters, vid_duration="00:40:00")
    for timestamp, title in chapters.items():
        print(f"Time: {timestamp}, Title: {title}")