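"""
RSS/web aggregator for recent AI news.

Fetches entries from a set of AI-related RSS feeds in parallel, scrapes
article cards from The Batch (deeplearning.ai), normalizes publication
dates, keeps items from the last 30 days, and writes the result to
ai_news.xlsx.

Note: the Excel export via pandas.DataFrame.to_excel assumes an engine
such as openpyxl is installed alongside feedparser, pandas, requests,
and beautifulsoup4.
"""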
import feedparser
import pandas as pd
from datetime import datetime, timedelta
import ssl
from bs4 import BeautifulSoup
import warnings
import concurrent.futures
import re
import requests
warnings.filterwarnings("ignore")
URL = "https://www.deeplearning.ai/the-batch/"
# Configure SSL once at the module level
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context


def extract_date(date_str):
    """Extract a date from various string formats using regex patterns"""
    try:
        # Try different patterns to match various date formats
        # Pattern 1: Standard RFC format like "Mon, 14 Apr 2025 10:00:00 GMT"
        pattern1 = r'(?:\w+,\s+)?(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern1, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%d %b %Y')

        # Pattern 2: Simple format like "14 Apr 2025"
        pattern2 = r'(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern2, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%d %b %Y')

        # Pattern 3: ISO format like "2025-04-14"
        pattern3 = r'(\d{4}-\d{2}-\d{2})'
        match = re.search(pattern3, date_str)
        if match:
            return pd.to_datetime(match.group(1))

        # Pattern 4: Format like "Mar 12, 2025"
        pattern4 = r'(\w{3}\s+\d{1,2},\s+\d{4})'
        match = re.search(pattern4, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%b %d, %Y')

        # If none of the patterns match, let pandas try to parse the raw string
        return pd.to_datetime(date_str)
    except Exception:
        # If all else fails, return NaT
        return pd.NaT


def clean_html(text):
    """Clean HTML tags from text"""
    try:
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    except Exception as e:
        print(f"Error cleaning HTML: {e}")
        return text


def extract_image_url(entry, description):
    """Extract image URL from RSS entry if available"""
    try:
        # Check for media:content
        if hasattr(entry, 'media_content') and entry.media_content:
            for media in entry.media_content:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']

        # Check for media:thumbnail
        if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail:
            for media in entry.media_thumbnail:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']

        # Check for enclosures
        if hasattr(entry, 'enclosures') and entry.enclosures:
            for enclosure in entry.enclosures:
                if isinstance(enclosure, dict) and 'url' in enclosure and enclosure.get('type', '').startswith('image/'):
                    return enclosure['url']

        # Try to extract from the description HTML using BeautifulSoup
        if description:
            soup = BeautifulSoup(description, "html.parser")

            # First, check meta tags for twitter:image
            meta_img = soup.find('meta', attrs={'name': 'twitter:image'})
            if meta_img and meta_img.has_attr('content'):
                return meta_img['content']

            # Then check for regular img tags
            img_tag = soup.find('img')
            if img_tag and img_tag.has_attr('src'):
                return img_tag['src']

            # Fall back to a regex over the raw HTML
            img_match = re.search(r'<img[^>]+src=[\'"]([^\'"]+)[\'"]', description)
            if img_match:
                return img_match.group(1)

        # No image found
        return None
    except Exception as e:
        print(f"Error extracting image URL: {e}")
        return None


def fetch_single_feed(link_source_tuple):
    """Fetch a single RSS feed and return its entries"""
    link, source = link_source_tuple
    entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    try:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            title = entry.get("title", "No Title")
            entry_link = entry.get("link", "No Link")  # avoid shadowing the feed URL used in error messages
            published = entry.get("published", "No Date")
            description = entry.get("description", "No Description")

            # Extract image URL
            image_url = extract_image_url(entry, description)

            entries["Title"].append(title)
            entries["Link"].append(entry_link)
            entries["Published"].append(published)
            entries["Description"].append(description)
            entries["Source"].append(source)
            entries["Image"].append(image_url)
    except Exception as e:
        print(f"Error fetching {link}: {e}")
    return entries


def fetch_feed(links):
    """Fetch multiple RSS feeds in parallel"""
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}

    # Use ThreadPoolExecutor to fetch feeds in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_link = {executor.submit(fetch_single_feed, (link, source)): (link, source)
                          for link, source in links.items()}
        for future in concurrent.futures.as_completed(future_to_link):
            link, source = future_to_link[future]
            try:
                result = future.result()
                # Merge results into all_entries
                for key in all_entries:
                    all_entries[key].extend(result[key])
            except Exception as e:
                print(f"Exception for {link}: {e}")

    # Create a DataFrame from all entries
    df = pd.DataFrame(all_entries)
    return df


def scrape_the_batch_articles():
    """Scrape article cards from The Batch landing page"""
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    try:
        res = requests.get(URL, timeout=30)
        soup = BeautifulSoup(res.text, "html.parser")
        articles = soup.find_all("article")

        for article in articles:
            # Link
            link_tag = article.find("a", href=True)
            link = "https://www.deeplearning.ai" + link_tag["href"] if link_tag else "#"

            # Title
            title_tag = article.find("h2")
            title = title_tag.get_text(strip=True) if title_tag else "No title"

            # Summary
            summary_tag = article.find("div", class_="text-sm")
            summary = summary_tag.get_text(strip=True) if summary_tag else ""

            # Date (based on div with specific class)
            date_tag = article.find("div", class_="text-slate-500")
            date_str = date_tag.get_text(strip=True) if date_tag else ""

            # Image
            img_tag = article.find("img")
            image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else None

            # Only keep articles whose date parses (e.g. "Apr 14, 2025")
            try:
                parsed_date = datetime.strptime(date_str, "%b %d, %Y")
            except Exception:
                parsed_date = None

            if parsed_date:
                all_entries["Title"].append(title)
                all_entries["Description"].append(summary)
                all_entries["Link"].append(link)
                all_entries["Published"].append(date_str)
                all_entries["Source"].append("deeplearning.ai")
                all_entries["Image"].append(image_url)

        return pd.DataFrame(all_entries)
    except Exception as e:
        print(f"Error scraping The Batch: {e}")
        return pd.DataFrame()


def extract_and_clean_data(df):
    """Process and clean the feed data"""
    if df.empty:
        return df
    try:
        # Work on a copy to avoid pandas SettingWithCopy issues
        df = df.copy()

        # Apply the custom date extraction function
        df['date'] = df['Published'].apply(extract_date)

        # Drop rows with invalid dates
        df = df.dropna(subset=['date'])

        # Drop the original 'Published' column
        df = df.drop(columns=['Published'])

        # Filter for the last 30 days (increased from 7 for more content)
        today = datetime.now()
        thirty_days_ago = today - timedelta(days=30)
        df_filtered = df[(df['date'] >= thirty_days_ago) & (df['date'] <= today)].copy()

        # Sort by date in descending order
        df_filtered = df_filtered.sort_values(by='date', ascending=False)

        # Clean HTML and limit description length in one step
        df_filtered['Description'] = df_filtered['Description'].apply(
            lambda x: clean_html(x)[:500].replace("\n", "")
        )
        return df_filtered
    except Exception as e:
        print(f"An error occurred while processing the data: {e}")
        return pd.DataFrame()


def main():
    # RSS links
    links = {
        "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
        "https://feeds.feedburner.com/nvidiablog": "NVIDIA Blog",
        "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
        "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
        "https://research.facebook.com/feed/": "META Research",
        "https://openai.com/news/rss.xml": "OpenAI News",
        "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
        "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
        "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
        "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
        "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
        "https://newsroom.ibm.com/press-releases-artificial-intelligence?pagetemplate=rss": "IBM - Announcements (Artificial intelligence)"
    }

    # Fetch data from The Batch
    batch_df = scrape_the_batch_articles()

    # Fetch data from RSS feeds
    rss_df = fetch_feed(links)

    # Combine both dataframes
    combined_df = pd.concat([batch_df, rss_df], ignore_index=True)

    # Process and clean data
    final_df = extract_and_clean_data(combined_df)
    return final_df


if __name__ == "__main__":
    df = main()
    print(df.head())
    df.to_excel("ai_news.xlsx")
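    # The exported DataFrame contains the columns:
    #   Title, Link, Description, Source, Image, date
    # ('Published' is replaced by the parsed 'date' column in extract_and_clean_data).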