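"""Aggregate recent AI news into a single pandas DataFrame.

Fetches a set of AI-related RSS feeds in parallel, scrapes article cards
from The Batch landing page, extracts publication dates and preview
images, strips HTML from descriptions, and keeps items from the last
30 days, sorted newest first.
"""
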
import feedparser
import pandas as pd
from datetime import datetime, timedelta
import ssl
from bs4 import BeautifulSoup
import warnings
import concurrent.futures
import re
import requests

warnings.filterwarnings("ignore")

URL = "https://www.deeplearning.ai/the-batch/"

# Configure SSL once at the module level: skip certificate verification for
# urllib-based fetches (used by feedparser) so feeds with certificate issues
# still load, at the cost of transport security
if hasattr(ssl, '_create_unverified_context'):
    ssl._create_default_https_context = ssl._create_unverified_context

def extract_date(date_str):
    """Extract date from various formats using regex patterns"""
    try:
        # Try different patterns to match various date formats
        
        # Pattern 1: RFC-style dates like "Mon, 14 Apr 2025 10:00:00 GMT";
        # the optional weekday prefix also lets this match bare "14 Apr 2025"
        pattern1 = r'(?:\w+,\s+)?(\d{1,2}\s+\w{3}\s+\d{4})'
        match = re.search(pattern1, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%d %b %Y')
        
        # Pattern 2: ISO format like "2025-04-14"
        pattern2 = r'(\d{4}-\d{2}-\d{2})'
        match = re.search(pattern2, date_str)
        if match:
            return pd.to_datetime(match.group(1))
        
        # Pattern 3: Format like "Mar 12, 2025"
        pattern3 = r'(\w{3}\s+\d{1,2},\s+\d{4})'
        match = re.search(pattern3, date_str)
        if match:
            return pd.to_datetime(match.group(1), format='%b %d, %Y')
        
        # If none of the patterns match, let pandas try to parse it directly
        return pd.to_datetime(date_str)
    except Exception:
        # If all else fails, return NaT
        return pd.NaT
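
# Illustrative calls (example inputs, not tied to any specific feed):
#   extract_date("Mon, 14 Apr 2025 10:00:00 GMT")  -> Timestamp('2025-04-14')
#   extract_date("2025-04-14")                     -> Timestamp('2025-04-14')
#   extract_date("Mar 12, 2025")                   -> Timestamp('2025-03-12')
#   extract_date("not a date")                     -> NaT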

def clean_html(text):
    """Clean HTML tags from text"""
    try:
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    except Exception as e:
        print(f"Error cleaning HTML: {e}")
        return text
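
# Illustrative call: clean_html("<p>Hello <b>world</b></p>") returns "Hello world".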

def extract_image_url(entry, description):
    """Extract image URL from RSS entry if available"""
    try:
        # Check for media:content
        if hasattr(entry, 'media_content') and entry.media_content:
            for media in entry.media_content:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']
        
        # Check for media:thumbnail
        if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail:
            for media in entry.media_thumbnail:
                if isinstance(media, dict) and 'url' in media:
                    return media['url']
        
        # Check for enclosures
        if hasattr(entry, 'enclosures') and entry.enclosures:
            for enclosure in entry.enclosures:
                if isinstance(enclosure, dict) and 'url' in enclosure and enclosure.get('type', '').startswith('image/'):
                    return enclosure['url']
        
        # Try to extract from description using BeautifulSoup
        if description:
            soup = BeautifulSoup(description, "html.parser")
            
            # First, check meta tags for twitter:image
            meta_img = soup.find('meta', attrs={'name': 'twitter:image'})
            if meta_img and meta_img.has_attr('content'):
                return meta_img['content']
            
            # Then check for regular img tags
            img_tag = soup.find('img')
            if img_tag and img_tag.has_attr('src'):
                return img_tag['src']
            
            # Try to extract image URL from HTML
            img_match = re.search(r'<img[^>]+src=[\'"]([^\'"]+)[\'"]', description)
            if img_match:
                return img_match.group(1)
        
        # No image found
        return None
    except Exception as e:
        print(f"Error extracting image URL: {e}")
        return None

def fetch_single_feed(link_source_tuple):
    """Fetch a single RSS feed and return its entries"""
    link, source = link_source_tuple
    entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    
    try:
        feed = feedparser.parse(link)
        
        for entry in feed.entries:
            title = entry.get("title", "No Title")
            entry_link = entry.get("link", "No Link")  # don't shadow the feed URL used in error messages
            published = entry.get("published", "No Date")
            description = entry.get("description", "No Description")
            
            # Extract image URL from media tags or the description HTML
            image_url = extract_image_url(entry, description)
            
            entries["Title"].append(title)
            entries["Link"].append(entry_link)
            entries["Published"].append(published)
            entries["Description"].append(description)
            entries["Source"].append(source)
            entries["Image"].append(image_url)
            
    except Exception as e:
        print(f"Error fetching {link}: {e}")
    
    return entries

def fetch_feed(links):
    """Fetch multiple RSS feeds in parallel"""
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    
    # Use ThreadPoolExecutor to fetch feeds in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_link = {executor.submit(fetch_single_feed, (link, source)): (link, source) 
                         for link, source in links.items()}
        
        for future in concurrent.futures.as_completed(future_to_link):
            link, source = future_to_link[future]
            try:
                result = future.result()
                # Merge results into all_entries
                for key in all_entries:
                    all_entries[key].extend(result[key])
            except Exception as e:
                print(f"Exception for {link}: {e}")
    
    # Create a DataFrame from all entries
    df = pd.DataFrame(all_entries)
    return df
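
# Illustrative usage with a single feed from the list in main():
#   df = fetch_feed({"https://openai.com/news/rss.xml": "OpenAI News"})
# gives one row per entry, with the raw 'Published' string left for
# extract_and_clean_data() to parse.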

def scrape_the_batch_articles():
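    """Scrape article cards from The Batch landing page via HTML parsing."""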
    all_entries = {"Title": [], "Link": [], "Published": [], "Description": [], "Source": [], "Image": []}
    
    try:
        res = requests.get(URL, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        articles = soup.find_all("article")

        for article in articles:
            # Link
            link_tag = article.find("a", href=True)
            href = link_tag["href"] if link_tag else "#"
            # Landing-page hrefs are site-relative, so prefix the domain
            link = "https://www.deeplearning.ai" + href if href.startswith("/") else href

            # Title
            title_tag = article.find("h2")
            title = title_tag.get_text(strip=True) if title_tag else "No title"

            # Summary
            summary_tag = article.find("div", class_="text-sm")
            summary = summary_tag.get_text(strip=True) if summary_tag else ""

            # Date (based on div with specific class)
            date_tag = article.find("div", class_="text-slate-500")
            date_str = date_tag.get_text(strip=True) if date_tag else ""
            
            # Image
            img_tag = article.find("img")
            image_url = img_tag["src"] if img_tag and img_tag.has_attr("src") else None

            try:
                parsed_date = datetime.strptime(date_str, "%b %d, %Y")
            except ValueError:
                parsed_date = None

            if parsed_date:
                all_entries["Title"].append(title)
                all_entries["Description"].append(summary)
                all_entries["Link"].append(link)
                all_entries["Published"].append(date_str)
                all_entries["Source"].append("deeplearning.ai")
                all_entries["Image"].append(image_url)
        
        return pd.DataFrame(all_entries)
    except Exception as e:
        print(f"Error scraping The Batch: {e}")
        return pd.DataFrame()

def extract_and_clean_data(df):
    """Process and clean the feed data"""
    if df.empty:
        return df
        
    try:
        # Apply the custom date extraction function
        df['date'] = df['Published'].apply(extract_date)
        
        # Drop rows with invalid dates; work on a copy so the in-place
        # operations below don't trigger pandas' SettingWithCopyWarning
        df = df.dropna(subset=['date']).copy()
        
        # Drop the original 'Published' column
        df.drop(columns=['Published'], inplace=True)
        
        # Filter for the last 30 days (increased from 7 for more content)
        today = datetime.now()
        thirty_days_ago = today - timedelta(days=30)
        df_filtered = df[(df['date'] >= thirty_days_ago) & (df['date'] <= today)].copy()
        
        # Sort by date in descending order
        df_filtered = df_filtered.sort_values(by='date', ascending=False)
        
        # Clean HTML and limit description length in one step
        df_filtered['Description'] = df_filtered['Description'].apply(
            lambda x: clean_html(x)[:500].replace("\n", "")
        )
        
        return df_filtered
        
    except Exception as e:
        print(f"An error occurred while processing the data: {e}")
        return pd.DataFrame()

def main():
    # RSS links
    links = {
        "https://bair.berkeley.edu/blog/feed.xml": "The Berkeley Artificial Intelligence Research Blog",
        "https://feeds.feedburner.com/nvidiablog": "NVDIA Blog",
        "https://www.microsoft.com/en-us/research/feed/": "Microsoft Research",
        "https://www.sciencedaily.com/rss/computers_math/artificial_intelligence.xml": "Science Daily",
        "https://research.facebook.com/feed/": "META Research",
        "https://openai.com/news/rss.xml": "OpenAI News",
        "https://deepmind.google/blog/feed/basic/": "Google DeepMind Blog",
        "https://news.mit.edu/rss/topic/artificial-intelligence2": "MIT News - Artificial intelligence",
        "https://www.technologyreview.com/topic/artificial-intelligence/feed": "MIT Technology Review - Artificial intelligence",
        "https://www.wired.com/feed/tag/ai/latest/rss": "Wired: Artificial Intelligence Latest",
        "https://raw.githubusercontent.com/Olshansk/rss-feeds/refs/heads/main/feeds/feed_ollama.xml": "Ollama Blog",
        "https://newsroom.ibm.com/press-releases-artificial-intelligence?pagetemplate=rss": "IBM - Announcements (Artificial intelligence)"
    }

    # Fetch data from The Batch
    batch_df = scrape_the_batch_articles()
    
    # Fetch data from RSS feeds
    rss_df = fetch_feed(links)
    
    # Combine both dataframes
    combined_df = pd.concat([batch_df, rss_df], ignore_index=True)
    
    # Process and clean data
    final_df = extract_and_clean_data(combined_df)
    
    return final_df

if __name__ == "__main__":
    df = main()
    print(df.head())
    df.to_excel("ai_news.xlsx", index=False)  # .xlsx output requires openpyxl