import feedparser
import pandas as pd
from datetime import datetime, timedelta
import ssl
from bs4 import BeautifulSoup
import warnings
import concurrent.futures
import re
import requests

warnings.filterwarnings("ignore")

URL = "https://www.deeplearning.ai/the-batch/"

# Configure SSL once at the module level
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

def extract_date(date_str):
    """Extract a date from various formats using regex patterns."""
    try:
        # Try different patterns to match various date formats
        # Pattern 1: Standard RFC format like "Mon, 14 Apr 2025 10:00:00 GMT"
        pattern1 = r'(?:\w+,\s+)?(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern1, date_str)
        if match:
            date_str = match.group(1)
            return pd.to_datetime(date_str, format='%d %b %Y')
        # Pattern 2: Simple format like "14 Apr 2025"
        pattern2 = r'(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern2, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%d %b %Y')
        # Pattern 3: ISO format like "2025-04-14"
        pattern3 = r'(\d{4}-\d{2}-\d{2})'
        match = re.search(pattern3, date_str)
        if match:
            return pd.to_datetime(match.group(1))
        # Pattern 4: Format like "Mar 12, 2025"
        pattern4 = r'(\w{3}\s+\d{1,2},\s+\d{4})'
        match = re.search(pattern4, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%b %d, %Y')
        # If none of the patterns match, let pandas try to parse the string as-is
        return pd.to_datetime(date_str)
    except Exception:
        # If all else fails, return NaT
        return pd.NaT

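# Illustrative examples (comments only, not executed) of what extract_date is
# expected to return for the patterns above; the sample strings are made up:
#   extract_date("Mon, 14 Apr 2025 10:00:00 GMT")  -> 2025-04-14
#   extract_date("2025-04-14")                     -> 2025-04-14
#   extract_date("Mar 12, 2025")                   -> 2025-03-12
#   extract_date("not a date")                     -> NaT
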
def clean_html(text):
    """Clean HTML tags from text."""
    try:
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    except Exception as e:
        print(f"Error cleaning HTML: {e}")
        return text

def extract_image_url(entry, description):
    """Extract image URL from RSS entry if available."""
    try:
        # Check for media:content
        if hasattr(entry, 'media_content') and entry.media_content:
            for media in entry.media_content:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']
        # Check for media:thumbnail
        if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail:
            for media in entry.media_thumbnail:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']
        # Check for enclosures
        if hasattr(entry, 'enclosures') and entry.enclosures:
            for enclosure in entry.enclosures:
                if isinstance(enclosure, dict) and 'url' in enclosure and enclosure.get('type', '').startswith('image/'):
                    return enclosure['url']
        # Try to extract from the description using BeautifulSoup
        if description:
            soup = BeautifulSoup(description, "html.parser")
            # First, check meta tags for twitter:image
            meta_img = soup.find('meta', attrs={'name': 'twitter:image'})
            if meta_img and meta_img.has_attr('content'):
                return meta_img['content']
            # Then check for regular img tags
            img_tag = soup.find('img')
            if img_tag and img_tag.has_attr('src'):
                return img_tag['src']
            # Fall back to a regex over the raw HTML
            img_match = re.search(r'<img[^>]+src=[\'"]([^\'"]+)[\'"]', description)
            if img_match:
                return img_match.group(1)
        # No image found
        return None
    except Exception as e:
        print(f"Error extracting image URL: {e}")
        return None

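# Illustrative example (comment only, not executed); the entry and URL below are
# made up. With no media fields set on the entry, the <img> in the description wins:
#   extract_image_url(entry, '<p>Hi</p><img src="https://example.com/pic.jpg">')
#     -> "https://example.com/pic.jpg"
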
def fetch_single_feed(link_source_tuple):
    """Fetch a single RSS feed and return its entries."""
    link, source = link_source_tuple
    entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    try:
        feed = feedparser.parse(link)
        for entry in feed.entries:
            title = entry.get("title", "No Title")
            entry_link = entry.get("link", "No Link")
            published = entry.get("published", "No Date")
            description = entry.get("description", "No Description")
            # Extract image URL
            image_url = extract_image_url(entry, description)
            entries["Title"].append(title)
            entries["Link"].append(entry_link)
            entries["Published"].append(published)
            entries["Description"].append(description)
            entries["Source"].append(source)
            entries["Image"].append(image_url)
    except Exception as e:
        # 'link' still refers to the feed URL; entry links use 'entry_link' above
        print(f"Error fetching {link}: {e}")
    return entries

def fetch_feed(links):
    """Fetch multiple RSS feeds in parallel."""
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    # Use ThreadPoolExecutor to fetch feeds in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_link = {executor.submit(fetch_single_feed, (link, source)): (link, source)
                          for link, source in links.items()}
        for future in concurrent.futures.as_completed(future_to_link):
            link, source = future_to_link[future]
            try:
                result = future.result()
                # Merge results into all_entries
                for key in all_entries:
                    all_entries[key].extend(result[key])
            except Exception as e:
                print(f"Exception for {link}: {e}")
    # Create a DataFrame from all entries
    df = pd.DataFrame(all_entries)
    return df

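# Minimal usage sketch (illustrative; the feed URL is a placeholder):
#   df = fetch_feed({"https://example.com/feed.xml": "Example Feed"})
#   df has the columns Title, Link, Published, Description, Source, Image
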
def scrape_the_batch_articles():
    """Scrape recent article cards from The Batch landing page."""
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    try:
        res = requests.get(URL)
        soup = BeautifulSoup(res.text, "html.parser")
        articles = soup.find_all("article")
        for article in articles:
            # Link
            link_tag = article.find("a", href=True)
            link = "https://www.deeplearning.ai" + link_tag["href"] if link_tag else "#"
            # Title
            title_tag = article.find("h2")
            title = title_tag.get_text(strip=True) if title_tag else "No title"
            # Summary
            summary_tag = article.find("div", class_="text-sm")
            summary = summary_tag.get_text(strip=True) if summary_tag else ""
            # Date (based on div with specific class)
            date_tag = article.find("div", class_="text-slate-500")
            date_str = date_tag.get_text(strip=True) if date_tag else ""
            # Image
            img_tag = article.find("img")
            image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else None
            # Keep only articles whose date parses, e.g. "Mar 12, 2025"
            try:
                parsed_date = datetime.strptime(date_str, "%b %d, %Y")
            except ValueError:
                parsed_date = None
            if parsed_date:
                all_entries["Title"].append(title)
                all_entries["Description"].append(summary)
                all_entries["Link"].append(link)
                all_entries["Published"].append(date_str)
                all_entries["Source"].append("deeplearning.ai")
                all_entries["Image"].append(image_url)
        return pd.DataFrame(all_entries)
    except Exception as e:
        print(f"Error scraping The Batch: {e}")
        return pd.DataFrame()

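# Note: the "article", "text-sm" and "text-slate-500" selectors above reflect the
# markup this scraper currently expects; if deeplearning.ai changes its HTML, the
# function simply returns an empty DataFrame rather than raising.
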
def extract_and_clean_data(df):
    """Process and clean the feed data."""
    if df.empty:
        return df
    try:
        # Apply the custom date extraction function
        df['date'] = df['Published'].apply(extract_date)
        # Drop rows with invalid dates
        df = df.dropna(subset=['date'])
        # Drop the original 'Published' column
        df = df.drop(columns=['Published'])
        # Filter for the last 30 days (increased from 7 for more content)
        today = datetime.now()
        thirty_days_ago = today - timedelta(days=30)
        # Copy so the Description assignment below works on a real frame, not a view
        df_filtered = df[(df['date'] >= thirty_days_ago) & (df['date'] <= today)].copy()
        # Sort by date in descending order
        df_filtered = df_filtered.sort_values(by='date', ascending=False)
        # Clean HTML and limit description length in one step
        df_filtered['Description'] = df_filtered['Description'].apply(
            lambda x: clean_html(x)[:500].replace("\n", "")
        )
        return df_filtered
    except Exception as e:
        print(f"An error occurred while processing the data: {e}")
        return pd.DataFrame()

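# The cleaned DataFrame is expected to carry the columns Title, Link, Description,
# Source, Image and the parsed 'date' column, restricted to the last 30 days and
# sorted newest first.
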
def main():
    # RSS links
    links = {
        "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
        "https://feeds.feedburner.com/nvidiablog": "NVIDIA Blog",
        "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
        "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
        "https://research.facebook.com/feed/": "META Research",
        "https://openai.com/news/rss.xml": "OpenAI News",
        "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
        "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
        "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
        "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
        "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
        "https://newsroom.ibm.com/press-releases-artificial-intelligence?pagetemplate=rss": "IBM - Announcements (Artificial intelligence)"
    }
    # Fetch data from The Batch
    batch_df = scrape_the_batch_articles()
    # Fetch data from RSS feeds
    rss_df = fetch_feed(links)
    # Combine both dataframes
    combined_df = pd.concat([batch_df, rss_df], ignore_index=True)
    # Process and clean data
    final_df = extract_and_clean_data(combined_df)
    return final_df

if __name__ == "__main__":
    df = main()
    print(df.head())
    # Writing .xlsx requires an Excel engine such as openpyxl to be installed
    df.to_excel("ai_news.xlsx")