"""Gradio app that looks for YouTube caption metadata via headless Chromium."""

import os
import time

import gradio as gr
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc

# Fallback path of the Chromium binary when CHROME_BINARY is not set.
DEFAULT_CHROME_BINARY = "/usr/bin/chromium"

# Fixed delay (seconds) given to the page to finish loading before scraping.
PAGE_LOAD_WAIT_SECONDS = 5

# Marker searched for in the page source to locate the caption metadata.
CAPTION_MARKER = "captionTracks"


def _extract_caption_tracks(page_source: str) -> str:
    """Return the raw ``captionTracks`` snippet from *page_source*.

    The snippet runs from the first occurrence of the marker up to and
    including the first ``]`` that follows it. An empty string is returned
    when either the marker or a closing bracket is missing.
    """
    start = page_source.find(CAPTION_MARKER)
    if start == -1:
        return ""

    # NOTE(review): the first "]" is a heuristic cut-off; if the embedded
    # data contains a nested array the snippet will be truncated early.
    closing = page_source.find("]", start)
    if closing == -1:
        # Without this guard the slice end would be 0 and the caller would
        # report success with an empty payload.
        return ""

    return page_source[start:closing + 1]


# Function to extract YouTube captions using a headless browser
def get_captions_selenium(video_url: str) -> str:
    """Load *video_url* in headless Chromium and look for caption metadata.

    Args:
        video_url: URL of the video page to open.

    Returns:
        A human-readable status message. On success it contains the raw
        ``captionTracks`` snippet found in the page source; otherwise it is
        a warning (nothing found) or an error description. Exceptions are
        not propagated: they are printed and reported in the returned text.
    """
    if not video_url or not video_url.strip():
        # Avoid spinning up a browser for an obviously unusable input.
        return "❌ Error: no video URL provided."

    try:
        print("🚀 Launching Chromium via undetected-chromedriver...")
        options = uc.ChromeOptions()
        # Point to the system-installed Chromium binary
        options.binary_location = os.environ.get(
            "CHROME_BINARY", DEFAULT_CHROME_BINARY
        )
        options.add_argument("--headless=new")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        driver = uc.Chrome(options=options)
        try:
            print("🌍 Navigating to video URL...")
            driver.get(video_url)

            print("⌛ Waiting for page to load...")
            time.sleep(PAGE_LOAD_WAIT_SECONDS)

            print("📄 Scraping page source...")
            page_source = driver.page_source
        finally:
            # Always shut the browser down, even when navigation or
            # scraping fails, so no Chromium process is left behind.
            driver.quit()

        caption_json = _extract_caption_tracks(page_source)
        if caption_json:
            return (
                "✅ Found potential captions info.\n"
                "(You can parse this JSON string to extract subtitles.)\n\n"
                + caption_json
            )
        return "⚠️ Captions info not found in source. May not be available or blocked."
    except Exception as exc:
        # Top-level boundary for the UI callback: report instead of raising.
        print(f"❌ Exception occurred: {exc}")
        return f"❌ Error: {exc}"


# Gradio interface definition
default_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

demo = gr.Interface(
    fn=get_captions_selenium,
    inputs=[
        gr.Textbox(value=default_url, label="YouTube Video URL"),
    ],
    outputs="text",
    title="YouTube Captions Scraper (Selenium)",
    description=(
        "Extract captions from a YouTube video using a headless browser with "
        "undetected-chromedriver. Logs will appear in the Space's console."
    ),
)

# Launched at import time, as in the original script.
demo.launch()