Rivalcoder committed on
Commit
5427076
·
1 Parent(s): 68c6dab
Files changed (2) hide show
  1. Dockerfile +32 -27
  2. app.py +22 -8
Dockerfile CHANGED
@@ -1,36 +1,41 @@
1
  FROM python:3.10-slim
2
 
3
- # Install Chrome and dependencies
4
- RUN apt-get update && apt-get install -y wget gnupg unzip curl ca-certificates fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 libnspr4 libnss3 libx11-xcb1 libxcomposite1 libxdamage1 libxrandr2 xdg-utils libu2f-udev libvulkan1
5
-
6
- # Install Chrome
7
- RUN wget -q -O google-chrome.deb https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
8
- && apt install -y ./google-chrome.deb \
9
- && rm google-chrome.deb
10
-
11
- # Install matching ChromeDriver
12
- RUN CHROME_VERSION=$(google-chrome --version | grep -oP '\d+\.\d+\.\d+\.\d+') \
13
- && CHROMEDRIVER_VERSION=$(curl -s "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_$CHROME_VERSION") \
14
- && wget -O /tmp/chromedriver.zip https://chromedriver.storage.googleapis.com/${CHROMEDRIVER_VERSION}/chromedriver_linux64.zip \
15
- && unzip /tmp/chromedriver.zip -d /usr/local/bin/ \
16
- && chmod +x /usr/local/bin/chromedriver \
17
- && rm /tmp/chromedriver.zip
18
-
19
-
20
- # Set environment variables
21
- ENV PATH="/usr/local/bin:$PATH"
22
- ENV CHROME_BIN="/usr/bin/google-chrome"
23
- ENV CHROMEDRIVER="/usr/local/bin/chromedriver"
24
-
25
- # Set working dir
 
 
26
  WORKDIR /app
27
 
28
- # Install Python dependencies
29
  COPY requirements.txt .
30
  RUN pip install --no-cache-dir -r requirements.txt
31
 
32
- # Copy app code
33
  COPY app.py .
34
 
35
- # Run the app
36
- CMD ["python", "app.py"]
 
 
 
 
1
  FROM python:3.10-slim
2
 
3
+ # 1) Install Chromium + driver + required libraries
4
+ RUN apt-get update && apt-get install -y \
5
+ chromium \
6
+ chromium-driver \
7
+ wget \
8
+ gnupg \
9
+ unzip \
10
+ curl \
11
+ ca-certificates \
12
+ fonts-liberation \
13
+ libappindicator3-1 \
14
+ libasound2 \
15
+ libatk-bridge2.0-0 \
16
+ libnspr4 \
17
+ libnss3 \
18
+ libx11-xcb1 \
19
+ libxcomposite1 \
20
+ libxdamage1 \
21
+ libxrandr2 \
22
+ xdg-utils \
23
+ libu2f-udev \
24
+ libvulkan1 \
25
+ && rm -rf /var/lib/apt/lists/*
26
+
27
+ # 2) Set working directory
28
  WORKDIR /app
29
 
30
+ # 3) Install Python dependencies
31
  COPY requirements.txt .
32
  RUN pip install --no-cache-dir -r requirements.txt
33
 
34
+ # 4) Copy application code
35
  COPY app.py .
36
 
37
+ # 5) Point undetected-chromedriver at system Chromium
38
+ ENV CHROME_BINARY=/usr/bin/chromium
39
+
40
+ # 6) Launch the Gradio app
41
+ CMD ["python", "app.py"]
app.py CHANGED
@@ -1,18 +1,22 @@
 
1
  import time
2
  import gradio as gr
3
  from selenium.webdriver.common.by import By
4
  import undetected_chromedriver as uc
5
 
 
6
  def get_captions_selenium(video_url):
7
  try:
8
- print("🚀 Launching Chrome...")
9
  options = uc.ChromeOptions()
10
- options.add_argument("--headless=new") # Use 'new' headless mode for Chrome 109+
 
 
11
  options.add_argument("--no-sandbox")
12
  options.add_argument("--disable-dev-shm-usage")
13
 
14
  driver = uc.Chrome(options=options)
15
- print("🌍 Navigating to video...")
16
  driver.get(video_url)
17
 
18
  print("⌛ Waiting for page to load...")
@@ -26,7 +30,11 @@ def get_captions_selenium(video_url):
26
  end = page_source.find("]", start) + 1
27
  caption_json = page_source[start:end]
28
  driver.quit()
29
- return "✅ Found potential captions info in page source (you may need to parse this JSON).\n\n" + caption_json
 
 
 
 
30
  else:
31
  driver.quit()
32
  return "⚠️ Captions info not found in source. May not be available or blocked."
@@ -35,11 +43,17 @@ def get_captions_selenium(video_url):
35
  print(f"❌ Exception occurred: {e}")
36
  return f"❌ Error: {str(e)}"
37
 
38
- # Gradio interface
 
39
  gr.Interface(
40
  fn=get_captions_selenium,
41
- inputs=[gr.Textbox(label="YouTube Video URL")],
 
 
42
  outputs="text",
43
  title="YouTube Captions Scraper (Selenium)",
44
- description="Uses Selenium with undetected-chromedriver to extract captions from a YouTube video."
45
- ).launch()
 
 
 
 
1
+ import os
2
  import time
3
  import gradio as gr
4
  from selenium.webdriver.common.by import By
5
  import undetected_chromedriver as uc
6
 
7
+ # Function to extract YouTube captions using a headless browser
8
  def get_captions_selenium(video_url):
9
  try:
10
+ print("🚀 Launching Chromium via undetected-chromedriver...")
11
  options = uc.ChromeOptions()
12
+ # Point to the system-installed Chromium binary
13
+ options.binary_location = os.environ.get("CHROME_BINARY", "/usr/bin/chromium")
14
+ options.add_argument("--headless=new")
15
  options.add_argument("--no-sandbox")
16
  options.add_argument("--disable-dev-shm-usage")
17
 
18
  driver = uc.Chrome(options=options)
19
+ print("🌍 Navigating to video URL...")
20
  driver.get(video_url)
21
 
22
  print("⌛ Waiting for page to load...")
 
30
  end = page_source.find("]", start) + 1
31
  caption_json = page_source[start:end]
32
  driver.quit()
33
+ return (
34
+ "✅ Found potential captions info.\n"
35
+ "(You can parse this JSON string to extract subtitles.)\n\n"
36
+ + caption_json
37
+ )
38
  else:
39
  driver.quit()
40
  return "⚠️ Captions info not found in source. May not be available or blocked."
 
43
  print(f"❌ Exception occurred: {e}")
44
  return f"❌ Error: {str(e)}"
45
 
46
+ # Gradio interface definition
47
+ default_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
48
  gr.Interface(
49
  fn=get_captions_selenium,
50
+ inputs=[
51
+ gr.Textbox(default=default_url, label="YouTube Video URL")
52
+ ],
53
  outputs="text",
54
  title="YouTube Captions Scraper (Selenium)",
55
+ description=(
56
+ "Extract captions from a YouTube video using a headless browser with "
57
+ "undetected-chromedriver. Logs will appear in the Space's console."
58
+ )
59
+ ).launch()