import concurrent.futures
import re
import ssl
import warnings
from datetime import datetime, timedelta

import feedparser
import pandas as pd
import requests
from bs4 import BeautifulSoup

warnings.filterwarnings("ignore")

URL = "https://www.deeplearning.ai/the-batch/"

# Configure SSL once at the module level
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context


def extract_date(date_str):
    """Extract date from various formats using regex patterns"""
    try:
        # Try different patterns to match various date formats
        # Pattern 1: Standard RFC format like "Mon, 14 Apr 2025 10:00:00 GMT"
        pattern1 = r'(?:\w+,\s+)?(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern1, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%d %b %Y')

        # Pattern 2: Simple format like "14 Apr 2025"
        pattern2 = r'(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern2, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%d %b %Y')

        # Pattern 3: ISO format like "2025-04-14"
        pattern3 = r'(\d{4}-\d{2}-\d{2})'
        match = re.search(pattern3, date_str)
        if match:
            return pd.to_datetime(match.group(1))

        # Pattern 4: Format like "Mar 12, 2025"
        pattern4 = r'(\w{3}\s+\d{1,2},\s+\d{4})'
        match = re.search(pattern4, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%b %d, %Y')

        # If none of the patterns match, fall back to pandas' own parser
        return pd.to_datetime(date_str)
    except Exception:
        # If all else fails, return NaT
        return pd.NaT


def clean_html(text):
    """Clean HTML tags from text"""
    try:
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    except Exception as e:
        print(f"Error cleaning HTML: {e}")
        return text


def extract_image_url(entry, description):
    """Extract image URL from RSS entry if available"""
    try:
        # Check for media:content
        if hasattr(entry, 'media_content') and entry.media_content:
            for media in entry.media_content:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']

        # Check for media:thumbnail
        if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail:
            for media in entry.media_thumbnail:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']

        # Check for enclosures
        if hasattr(entry, 'enclosures') and entry.enclosures:
            for enclosure in entry.enclosures:
                if (isinstance(enclosure, dict) and 'url' in enclosure
                        and enclosure.get('type', '').startswith('image/')):
                    return enclosure['url']

        # Try to extract from description using BeautifulSoup
        if description:
            soup = BeautifulSoup(description, "html.parser")

            # First, check meta tags for twitter:image
            meta_img = soup.find('meta', attrs={'name': 'twitter:image'})
            if meta_img and meta_img.has_attr('content'):
                return meta_img['content']

            # Then check for regular img tags
            img_tag = soup.find('img')
            if img_tag and img_tag.has_attr('src'):
                return img_tag['src']

            # Fall back to a regex for <img ... src="..."> in the raw HTML
            img_match = re.search(r'<img[^>]+src=[\'"]([^\'"]+)[\'"]', description)
            if img_match:
                return img_match.group(1)

        # No image found
        return None
    except Exception as e:
        print(f"Error extracting image URL: {e}")
        return None


def fetch_single_feed(link_source_tuple):
    """Fetch a single RSS feed and return its entries"""
    link, source = link_source_tuple
    entries = {"Title": [], "Link": [], "Published": [],
               "Description": [], "Source": [], "Image": []}
    try:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            title = entry.get("title", "No Title")
            entry_link = entry.get("link", "No Link")  # don't shadow the feed URL
            published = entry.get("published", "No Date")
            description = entry.get("description", "No Description")
Description") # Extract image URL image_url = extract_image_url(entry, description) entries["Title"].append(title) entries["Link"].append(link) entries["Published"].append(published) entries["Description"].append(description) entries["Source"].append(source) entries["Image"].append(image_url) # Add image URL except Exception as e: print(f"Error fetching {link}: {e}") return entries def fetch_feed(links): """Fetch multiple RSS feeds in parallel""" all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []} # Use ThreadPoolExecutor to fetch feeds in parallel with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: future_to_link = {executor.submit(fetch_single_feed, (link, source)): (link, source) for link, source in links.items()} for future in concurrent.futures.as_completed(future_to_link): link, source = future_to_link[future] try: result = future.result() # Merge results into all_entries for key in all_entries: all_entries[key].extend(result[key]) except Exception as e: print(f"Exception for {link}: {e}") # Create a DataFrame from all entries df = pd.DataFrame(all_entries) return df def scrape_the_batch_articles(): all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []} try: res = requests.get(URL) soup = BeautifulSoup(res.text, "html.parser") articles = soup.find_all("article") for article in articles: # Link link_tag = article.find("a", href=True) link = "https://www.deeplearning.ai" + link_tag["href"] if link_tag else "#" # Title title_tag = article.find("h2") title = title_tag.get_text(strip=True) if title_tag else "No title" # Summary summary_tag = article.find("div", class_="text-sm") summary = summary_tag.get_text(strip=True) if summary_tag else "" # Date (based on div with specific class) date_tag = article.find("div", class_="text-slate-500") date_str = date_tag.get_text(strip=True) if date_tag else "" # Image img_tag = article.find("img") image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else None try: parsed_date = datetime.strptime(date_str, "%b %d, %Y") except Exception as e: parsed_date = None if parsed_date: all_entries["Title"].append(title) all_entries["Description"].append(summary) all_entries["Link"].append(link) all_entries["Published"].append(date_str) all_entries["Source"].append("deeplearning.ai") all_entries["Image"].append(image_url) return pd.DataFrame(all_entries) except Exception as e: print(f"Error scraping The Batch: {e}") return pd.DataFrame() def extract_and_clean_data(df): """Process and clean the feed data""" if df.empty: return df try: # Apply the custom date extraction function df['date'] = df['Published'].apply(extract_date) # Drop rows with invalid dates df = df.dropna(subset=['date']) # Drop the original 'Published' column df.drop(columns=['Published'], inplace=True) # Filter for the last 30 days (increased from 7 for more content) today = datetime.now() thirty_days_ago = today - timedelta(days=30) df_filtered = df[(df['date'] >= thirty_days_ago) & (df['date'] <= today)] # Sort by date in descending order df_filtered = df_filtered.sort_values(by='date', ascending=False) # Clean HTML and limit description length in one step df_filtered['Description'] = df_filtered['Description'].apply( lambda x: clean_html(x)[:500].replace("\n", "") ) return df_filtered except Exception as e: print(f"An error occurred while processing the data: {e}") return pd.DataFrame() def main(): # RSS links links = { "https://bair.berkeley.edu/blog/feed.xml": 
"The Berkeley Artificial Intelligence Research Blog", "https://feeds.feedburner.com/nvidiablog": "NVDIA Blog", "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research", "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily", "https://research.facebook.com/feed/": "META Research", "https://openai.com/news/rss.xml": "OpenAI News", "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog", "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence", "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence", "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest", "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog", "https://newsroom.ibm.com/press-releases-artificial-intelligence?pagetemplate=rss": "IBM - Announcements (Artificial intelligence)" } # Fetch data from The Batch batch_df = scrape_the_batch_articles() # Fetch data from RSS feeds rss_df = fetch_feed(links) # Combine both dataframes combined_df = pd.concat([batch_df, rss_df], ignore_index=True) # Process and clean data final_df = extract_and_clean_data(combined_df) return final_df if __name__ == "__main__": df = main() print(df.head()) df.to_excel("ai_news.xlsx")