File size: 2,069 Bytes
5427076
cbf58a5
 
 
 
 
5427076
cbf58a5
 
5427076
cbf58a5
5427076
 
 
cbf58a5
 
 
 
5427076
cbf58a5
 
a0bdeee
 
cbf58a5
a0bdeee
cbf58a5
a0bdeee
cbf58a5
 
 
 
 
5427076
 
 
 
 
cbf58a5
 
 
 
 
a0bdeee
cbf58a5
 
5427076
 
cbf58a5
 
5427076
39ddda1
 
5427076
cbf58a5
 
5427076
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import time
import gradio as gr
from selenium.webdriver.common.by import By
import undetected_chromedriver as uc

# Extract YouTube caption-track metadata using a headless browser.
def _slice_caption_tracks(page_source):
    """Return the ``captionTracks`` fragment of *page_source*, or None if absent.

    The fragment starts at the literal text ``captionTracks`` and runs through
    the ``]`` that closes the array following it. Brackets inside double-quoted
    strings are ignored, and nested arrays are kept whole. When no balanced
    array can be found, the text up to the first ``]`` is returned instead
    (an empty string if there is no ``]`` at all).
    """
    start = page_source.find("captionTracks")
    if start == -1:
        return None

    opening = page_source.find("[", start)
    first_close = page_source.find("]", start)
    # Fallback slice: everything up to and including the first "]".
    naive_slice = page_source[start:first_close + 1]
    if opening == -1 or first_close < opening:
        return naive_slice

    depth = 0
    in_string = False
    escaped = False
    for index in range(opening, len(page_source)):
        char = page_source[index]
        if in_string:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == "[":
            depth += 1
        elif char == "]":
            depth -= 1
            if depth == 0:
                return page_source[start:index + 1]

    # Unbalanced brackets (e.g. the JSON is itself escaped inside a string).
    return naive_slice


def get_captions_selenium(video_url):
    """Load *video_url* in headless Chromium and return its caption-track info.

    Args:
        video_url: URL of the video page to open.

    Returns:
        A human-readable string: the raw ``captionTracks`` fragment found in
        the page source prefixed by a success note, a warning when the page
        contains no ``captionTracks`` text, or an error message. Exceptions
        are never propagated; they are printed and reported in the returned
        string.
    """
    # Reject blank input up front instead of starting a browser just to fail.
    if not isinstance(video_url, str) or not video_url.strip():
        return "❌ Error: No video URL provided."

    driver = None
    try:
        print("🚀 Launching Chromium via undetected-chromedriver...")
        options = uc.ChromeOptions()
        # Point to the system-installed Chromium binary
        options.binary_location = os.environ.get("CHROME_BINARY", "/usr/bin/chromium")
        options.add_argument("--headless=new")
        # NOTE(review): presumably required for running inside a container;
        # confirm against the deployment environment.
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        driver = uc.Chrome(options=options)
        print("🌍 Navigating to video URL...")
        driver.get(video_url)

        print("⌛ Waiting for page to load...")
        # Fixed delay; the page is read as-is once it elapses.
        time.sleep(5)

        print("📄 Scraping page source...")
        caption_json = _slice_caption_tracks(driver.page_source)
        if caption_json is None:
            return "⚠️ Captions info not found in source. May not be available or blocked."
        return (
            "✅ Found potential captions info.\n"
            "(You can parse this JSON string to extract subtitles.)\n\n"
            + caption_json
        )
    except Exception as e:
        # Boundary handler: the UI expects a string, not a traceback.
        print(f"❌ Exception occurred: {e}")
        return f"❌ Error: {e}"
    finally:
        # Always shut the browser down, including when navigation or scraping
        # raised, so no Chromium process is left behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"⚠️ Failed to close browser cleanly: {quit_error}")

# Gradio UI wiring: a single URL textbox in, plain text out.
default_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"

# Input widget, pre-filled with the default video URL.
url_input = gr.Textbox(value=default_url, label="YouTube Video URL")

# Blurb shown under the title in the web UI.
app_description = (
    "Extract captions from a YouTube video using a headless browser with "
    "undetected-chromedriver. Logs will appear in the Space's console."
)

demo = gr.Interface(
    fn=get_captions_selenium,
    inputs=[url_input],
    outputs="text",
    title="YouTube Captions Scraper (Selenium)",
    description=app_description,
)

# Start the web server as soon as the module is executed.
demo.launch()