# latest-ai-news / fetch_data.py
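"""Fetch recent AI news for the latest-ai-news project.

Pulls entries from a set of AI-related RSS feeds in parallel, scrapes The Batch
(deeplearning.ai), and cleans everything into a single DataFrame covering
roughly the last 30 days.
"""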
import feedparser
import pandas as pd
from datetime import datetime, timedelta
import ssl
from bs4 import BeautifulSoup
import warnings
import concurrent.futures
import re
import requests

warnings.filterwarnings("ignore")

URL = "https://www.deeplearning.ai/the-batch/"

# Configure SSL once at the module level
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context
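# Note: this monkeypatch disables certificate verification for standard-library
# HTTPS requests (which feedparser uses under the hood). Acceptable for a news
# aggregator, but do not reuse it in security-sensitive code.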
def extract_date(date_str):
    """Extract date from various formats using regex patterns"""
    try:
        # Try different patterns to match various date formats
        # Pattern 1: Standard RFC format like "Mon, 14 Apr 2025 10:00:00 GMT"
        pattern1 = r'(?:\w+,\s+)?(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern1, date_str)
        if match:
            date_str = match.group(1)
            return pd.to_datetime(date_str, format='%d %b %Y')

        # Pattern 2: Simple format like "14 Apr 2025"
        pattern2 = r'(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern2, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%d %b %Y')

        # Pattern 3: ISO format like "2025-04-14"
        pattern3 = r'(\d{4}-\d{2}-\d{2})'
        match = re.search(pattern3, date_str)
        if match:
            return pd.to_datetime(match.group(1))

        # Pattern 4: Format like "Mar 12, 2025"
        pattern4 = r'(\w{3}\s+\d{1,2},\s+\d{4})'
        match = re.search(pattern4, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%b %d, %Y')

        # If none of the patterns match, fall back to pandas' general parser
        return pd.to_datetime(date_str)
    except Exception:
        # If all else fails, return NaT
        return pd.NaT
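# Illustrative behaviour (examples taken from the pattern comments above):
#   extract_date("Mon, 14 Apr 2025 10:00:00 GMT") -> Timestamp("2025-04-14")
#   extract_date("not a date")                    -> NaT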
def clean_html(text):
    """Clean HTML tags from text"""
    try:
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    except Exception as e:
        print(f"Error cleaning HTML: {e}")
        return text
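# Example (illustrative): clean_html("<p>OpenAI <b>news</b></p>") returns "OpenAI news"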
def extract_image_url(entry, description):
    """Extract image URL from RSS entry if available"""
    try:
        # Check for media:content
        if hasattr(entry, 'media_content') and entry.media_content:
            for media in entry.media_content:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']

        # Check for media:thumbnail
        if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail:
            for media in entry.media_thumbnail:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']

        # Check for enclosures
        if hasattr(entry, 'enclosures') and entry.enclosures:
            for enclosure in entry.enclosures:
                if isinstance(enclosure, dict) and 'url' in enclosure and enclosure.get('type', '').startswith('image/'):
                    return enclosure['url']

        # Try to extract from the description using BeautifulSoup
        if description:
            soup = BeautifulSoup(description, "html.parser")

            # First, check meta tags for twitter:image
            meta_img = soup.find('meta', attrs={'name': 'twitter:image'})
            if meta_img and meta_img.has_attr('content'):
                return meta_img['content']

            # Then check for regular img tags
            img_tag = soup.find('img')
            if img_tag and img_tag.has_attr('src'):
                return img_tag['src']

            # Finally, fall back to a regex over the raw HTML
            img_match = re.search(r'<img[^>]+src=[\'"]([^\'"]+)[\'"]', description)
            if img_match:
                return img_match.group(1)

        # No image found
        return None
    except Exception as e:
        print(f"Error extracting image URL: {e}")
        return None
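# Example (illustrative): for an entry whose description contains
#   <img src="https://example.com/pic.png">
# and no media:content/media:thumbnail/enclosure, this returns "https://example.com/pic.png".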
def fetch_single_feed(link_source_tuple):
    """Fetch a single RSS feed and return its entries"""
    link, source = link_source_tuple
    entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    try:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            title = entry.get("title", "No Title")
            # Use a separate name so the feed URL is not shadowed for the error message below
            entry_link = entry.get("link", "No Link")
            published = entry.get("published", "No Date")
            description = entry.get("description", "No Description")

            # Extract image URL
            image_url = extract_image_url(entry, description)

            entries["Title"].append(title)
            entries["Link"].append(entry_link)
            entries["Published"].append(published)
            entries["Description"].append(description)
            entries["Source"].append(source)
            entries["Image"].append(image_url)
    except Exception as e:
        print(f"Error fetching {link}: {e}")
    return entries
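# Example: fetch_single_feed(("https://openai.com/news/rss.xml", "OpenAI News"))
# returns a dict of parallel lists (Title, Link, Published, Description, Source, Image).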
def fetch_feed(links):
    """Fetch multiple RSS feeds in parallel"""
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}

    # Use ThreadPoolExecutor to fetch feeds in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_link = {executor.submit(fetch_single_feed, (link, source)): (link, source)
                          for link, source in links.items()}
        for future in concurrent.futures.as_completed(future_to_link):
            link, source = future_to_link[future]
            try:
                result = future.result()
                # Merge results into all_entries
                for key in all_entries:
                    all_entries[key].extend(result[key])
            except Exception as e:
                print(f"Exception for {link}: {e}")

    # Create a DataFrame from all entries
    df = pd.DataFrame(all_entries)
    return df
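# Example: fetch_feed({"https://openai.com/news/rss.xml": "OpenAI News"}) returns a
# DataFrame with one row per feed entry and the six columns listed above.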
def scrape_the_batch_articles():
    """Scrape recent articles from The Batch (deeplearning.ai)"""
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    try:
        res = requests.get(URL, timeout=30)
        soup = BeautifulSoup(res.text, "html.parser")
        articles = soup.find_all("article")
        for article in articles:
            # Link
            link_tag = article.find("a", href=True)
            link = "https://www.deeplearning.ai" + link_tag["href"] if link_tag else "#"

            # Title
            title_tag = article.find("h2")
            title = title_tag.get_text(strip=True) if title_tag else "No title"

            # Summary
            summary_tag = article.find("div", class_="text-sm")
            summary = summary_tag.get_text(strip=True) if summary_tag else ""

            # Date (based on div with specific class)
            date_tag = article.find("div", class_="text-slate-500")
            date_str = date_tag.get_text(strip=True) if date_tag else ""

            # Image
            img_tag = article.find("img")
            image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else None

            # Only keep articles whose date parses as a "Mar 12, 2025"-style string
            try:
                parsed_date = datetime.strptime(date_str, "%b %d, %Y")
            except ValueError:
                parsed_date = None

            if parsed_date:
                all_entries["Title"].append(title)
                all_entries["Description"].append(summary)
                all_entries["Link"].append(link)
                all_entries["Published"].append(date_str)
                all_entries["Source"].append("deeplearning.ai")
                all_entries["Image"].append(image_url)

        return pd.DataFrame(all_entries)
    except Exception as e:
        print(f"Error scraping The Batch: {e}")
        return pd.DataFrame()
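# NOTE: this scraper depends on the current markup of https://www.deeplearning.ai/the-batch/
# (<article> tags plus the "text-sm" and "text-slate-500" CSS classes); if the site layout
# changes, it will quietly return an empty DataFrame rather than raise.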
def extract_and_clean_data(df):
    """Process and clean the feed data"""
    if df.empty:
        return df
    try:
        # Apply the custom date extraction function
        df['date'] = df['Published'].apply(extract_date)

        # Drop rows with invalid dates
        df = df.dropna(subset=['date'])

        # Drop the original 'Published' column
        df = df.drop(columns=['Published'])

        # Filter for the last 30 days (increased from 7 for more content);
        # copy the slice so later assignments don't trigger SettingWithCopyWarning
        today = datetime.now()
        thirty_days_ago = today - timedelta(days=30)
        df_filtered = df[(df['date'] >= thirty_days_ago) & (df['date'] <= today)].copy()

        # Sort by date in descending order
        df_filtered = df_filtered.sort_values(by='date', ascending=False)

        # Clean HTML and limit description length in one step
        df_filtered['Description'] = df_filtered['Description'].apply(
            lambda x: clean_html(x)[:500].replace("\n", "")
        )
        return df_filtered
    except Exception as e:
        print(f"An error occurred while processing the data: {e}")
        return pd.DataFrame()
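# The cleaned DataFrame keeps Title, Link, Description, Source, Image plus a parsed
# 'date' column, restricted to the last 30 days and sorted newest-first.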
def main():
    # RSS links
    links = {
        "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
        "https://feeds.feedburner.com/nvidiablog": "NVIDIA Blog",
        "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
        "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
        "https://research.facebook.com/feed/": "META Research",
        "https://openai.com/news/rss.xml": "OpenAI News",
        "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
        "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
        "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
        "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
        "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
        "https://newsroom.ibm.com/press-releases-artificial-intelligence?pagetemplate=rss": "IBM - Announcements (Artificial intelligence)"
    }

    # Fetch data from The Batch
    batch_df = scrape_the_batch_articles()

    # Fetch data from RSS feeds
    rss_df = fetch_feed(links)

    # Combine both dataframes
    combined_df = pd.concat([batch_df, rss_df], ignore_index=True)

    # Process and clean data
    final_df = extract_and_clean_data(combined_df)
    return final_df
if __name__ == "__main__":
    df = main()
    print(df.head())
    df.to_excel("ai_news.xlsx")
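    # Note: DataFrame.to_excel() needs an Excel writer engine (e.g. the openpyxl package) installed.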