# Spaces: Runtime error
# (Page-status text captured together with this file; it is not part of the program.)
import os | |
import time | |
import gradio as gr | |
from selenium.webdriver.common.by import By | |
import undetected_chromedriver as uc | |
def _extract_caption_snippet(page_source):
    """Return the raw ``captionTracks`` text found in *page_source*, or None.

    The snippet runs from the first occurrence of ``captionTracks`` up to and
    including the first ``]`` that follows it. ``None`` is returned when the
    marker is absent or when no closing bracket follows it, so callers never
    receive an empty "match".
    """
    start = page_source.find("captionTracks")
    if start == -1:
        return None
    closing = page_source.find("]", start)
    if closing == -1:
        # Marker present but no closing bracket after it: nothing usable.
        return None
    return page_source[start:closing + 1]


# Function to extract YouTube captions using a headless browser
def get_captions_selenium(video_url: str) -> str:
    """Load *video_url* in headless Chromium and report its caption-track text.

    Args:
        video_url: Address of the page to open in the browser.

    Returns:
        A human-readable message: the raw ``captionTracks`` snippet when one
        is found in the page source, a warning when it is not, or an error
        description when anything raises. Exceptions are never propagated;
        they are printed and returned as text.
    """
    try:
        print("π Launching Chromium via undetected-chromedriver...")
        options = uc.ChromeOptions()
        # Point to the system-installed Chromium binary
        options.binary_location = os.environ.get("CHROME_BINARY", "/usr/bin/chromium")
        for flag in ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage"):
            options.add_argument(flag)
        driver = uc.Chrome(options=options)
        try:
            print("π Navigating to video URL...")
            driver.get(video_url)
            print("β Waiting for page to load...")
            # Fixed pause before reading the page source.
            time.sleep(5)
            print("π Scraping page source...")
            page_source = driver.page_source
        finally:
            # Always shut the browser down, including when navigation or
            # scraping raises, so no Chromium process is left behind.
            driver.quit()

        caption_json = _extract_caption_snippet(page_source)
        if caption_json is None:
            return "β οΈ Captions info not found in source. May not be available or blocked."
        return (
            "β Found potential captions info.\n"
            "(You can parse this JSON string to extract subtitles.)\n\n"
            + caption_json
        )
    except Exception as e:
        # Top-level boundary for the UI callback: report the failure as text.
        print(f"β Exception occurred: {e}")
        return f"β Error: {str(e)}"
# Gradio interface definition: one URL textbox in, plain text out.
default_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

url_input = gr.Textbox(value=default_url, label="YouTube Video URL")
app_description = (
    "Extract captions from a YouTube video using a headless browser with "
    "undetected-chromedriver. Logs will appear in the Space's console."
)

demo = gr.Interface(
    fn=get_captions_selenium,
    inputs=[url_input],
    outputs="text",
    title="YouTube Captions Scraper (Selenium)",
    description=app_description,
)
# Start the web UI as soon as the module runs.
demo.launch()