Spaces:
Runtime error
Runtime error
File size: 2,069 Bytes
5427076 cbf58a5 5427076 cbf58a5 5427076 cbf58a5 5427076 cbf58a5 5427076 cbf58a5 a0bdeee cbf58a5 a0bdeee cbf58a5 a0bdeee cbf58a5 5427076 cbf58a5 a0bdeee cbf58a5 5427076 cbf58a5 5427076 39ddda1 5427076 cbf58a5 5427076 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import os
import time
import gradio as gr
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc
def _extract_caption_tracks(page_source):
    """Return the raw ``captionTracks`` snippet from *page_source*, or None.

    The snippet starts at the first occurrence of ``captionTracks`` and runs
    up to and including the first ``]`` that follows it. ``None`` is returned
    when either marker is missing.
    """
    start = page_source.find("captionTracks")
    if start == -1:
        return None
    end = page_source.find("]", start)
    if end == -1:
        # Without this check the slice end would be ``-1 + 1 == 0`` and an
        # empty string would be reported as a successful match.
        return None
    # NOTE: this stops at the first closing bracket, so a nested array inside
    # the captions data would cut the snippet short.
    return page_source[start:end + 1]


# Function to extract YouTube captions using a headless browser
def get_captions_selenium(video_url):
    """Load *video_url* in headless Chromium and look for caption metadata.

    The Chromium binary is taken from the ``CHROME_BINARY`` environment
    variable, falling back to ``/usr/bin/chromium``. Progress is printed to
    stdout.

    Args:
        video_url: Address of the page to load.

    Returns:
        A human-readable string: the raw ``captionTracks`` snippet when one is
        present in the page source, a warning when it is not, or an error
        message when anything raised. Exceptions are never propagated to the
        caller.
    """
    try:
        # Reject empty input before paying for a browser launch; raising here
        # reuses the same error reporting as every other failure.
        if not video_url or not video_url.strip():
            raise ValueError("No video URL provided.")

        print("π Launching Chromium via undetected-chromedriver...")
        options = uc.ChromeOptions()
        # Point to the system-installed Chromium binary
        options.binary_location = os.environ.get("CHROME_BINARY", "/usr/bin/chromium")
        options.add_argument("--headless=new")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        driver = uc.Chrome(options=options)
        try:
            print("π Navigating to video URL...")
            driver.get(video_url)
            print("β Waiting for page to load...")
            time.sleep(5)
            print("π Scraping page source...")
            caption_json = _extract_caption_tracks(driver.page_source)
        finally:
            # Always shut the browser down, including when navigation or
            # scraping raised, so no Chromium process is left behind.
            driver.quit()

        if caption_json is None:
            return "β οΈ Captions info not found in source. May not be available or blocked."
        return (
            "β Found potential captions info.\n"
            "(You can parse this JSON string to extract subtitles.)\n\n"
            + caption_json
        )
    except Exception as e:
        # Top-level boundary for the UI callback: report the failure as text
        # instead of letting it propagate.
        print(f"β Exception occurred: {e}")
        return f"β Error: {str(e)}"
# Gradio interface definition
default_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

# Single-textbox UI: the URL goes in, the scraper's status text comes out.
demo = gr.Interface(
    fn=get_captions_selenium,
    inputs=[
        gr.Textbox(value=default_url, label="YouTube Video URL"),
    ],
    outputs="text",
    title="YouTube Captions Scraper (Selenium)",
    description=(
        "Extract captions from a YouTube video using a headless browser with "
        "undetected-chromedriver. Logs will appear in the Space's console."
    ),
)

# Launched at import time, as before.
demo.launch()