# News article retrieval
from newspaper import Article, Config
from gnews import GNews

# Data Analysis and Profiling
import pandas as pd
from ydata_profiling import ProfileReport
from st_aggrid import AgGrid

# Streamlit for Building the Dashboard
import streamlit as st
from streamlit_pandas_profiling import st_profile_report

# Language Detection
from langdetect import detect

# NLP and Text Processing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# URL Parsing
from urllib.parse import urlparse

# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Word Cloud Generation
from wordcloud import WordCloud

# Other Libraries
import torch
import requests
import subprocess
import logging
import json
import re
import os

# NLTK Data Download
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

## ............................................... ##
# Set page configuration (must be the first Streamlit command)
st.set_page_config(page_title='News Scraping', layout='wide', page_icon=':newspaper:')

with st.container():
    # Initialize Streamlit app
    st.title('News Article Scraping')
    st.write("Created by Bayhaqy")

## ............................................... ##
# Set up logging
logging.basicConfig(filename='news_processing.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

## ............................................... ##
# Function to load the model and tokenizer
@st.cache_resource
def get_models_and_tokenizers():
    model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    #model.eval()

    return model, tokenizer

# Function for trust check and sentiment analysis with DistilBERT
@st.cache_data
def analyze_sentiment_distilbert(text, _model, _tokenizer):
    try:
        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            raw_predictions = _model(**tokens_info).logits

        predicted_class_id = raw_predictions.argmax().item()
        predict = _model.config.id2label[predicted_class_id]

        softmaxed = int(torch.nn.functional.softmax(raw_predictions[0], dim=0)[1] * 100)
        if (softmaxed > 70):
            status = 'Not trust'
        elif (softmaxed > 40):
            status = 'Not sure'
        else:
            status = 'Trust'
        return status, predict

    except Exception as e:
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'
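
# Illustrative usage sketch (not executed by the app), using the cached model
# from get_models_and_tokenizers(); the headline string is made up:
#
#   model, tokenizer = get_models_and_tokenizers()
#   status, label = analyze_sentiment_distilbert('Stocks rise on trade deal hopes.', model, tokenizer)
#   # status is 'Trust', 'Not sure', or 'Not trust', derived from the
#   # POSITIVE-class probability thresholds above; label is the checkpoint's
#   # id2label string, i.e. 'POSITIVE' or 'NEGATIVE'.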

# Function for sentiment analysis using VADER
@st.cache_data
def analyze_sentiment_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    compound_score = sentiment['compound']
    if compound_score >= 0.05:
        return 'Positive'
    elif compound_score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Function for sentiment analysis using TextBlob
@st.cache_data
def analyze_sentiment_textblob(text):
    analysis = TextBlob(text)
    polarity = analysis.sentiment.polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'
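
# Illustrative sketch (not executed by the app) of how the two rule-based
# analyzers can differ on the same made-up sentence; thresholds are the ones
# defined in the functions above:
#
#   sample = 'The new policy is not entirely bad.'
#   analyze_sentiment_vader(sample)     # VADER scores negations heuristically
#   analyze_sentiment_textblob(sample)  # TextBlob uses pattern-based polarity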

## ............................................... ##
# Function to process an article
@st.cache_data
def process_article(url, _config):
    try:
        article = Article(url=url, config=_config)
        article.download()
        article.parse()

        # Check if publish_date is not None before further processing
        if article.publish_date is None:
            return None  # Skip processing and return None

        # Skip articles whose text is missing or too short
        if len(article.text) <= 5:
            return None  # Skip processing and return None

        # Extract the article data once both checks have passed
        text = article.text
        url = article.canonical_link
        source_url = urlparse(url).netloc

        title = article.title
        authors = article.authors
        #publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S%z')
        publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M')

        article.nlp()
        keywords = article.meta_keywords
        summary = article.summary

        language = detect(title)

        return publish_date, language, url, source_url, title, authors, keywords, text, summary

    except Exception as e:
        logging.error(f"Article processing error: {str(e)}")
        return None  # Skip processing and return None
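
# Illustrative sketch (not executed by the app): processing one hypothetical
# URL with the newspaper Config created further below. A None result means the
# article was skipped (missing publish date, too little text, or an error):
#
#   result = process_article('https://example.com/some-article', _config=config)
#   if result is not None:
#       publish_date, language, url, source_url, title, authors, keywords, text, summary = result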

# Function for translation
@st.cache_data
def translate_text(text, source='auto', target='en'):
    try:
        if source != target:
            text = GoogleTranslator(source=source, target=target).translate(text)
        return text

    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text
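
# Illustrative sketch (not executed by the app): GoogleTranslator accepts ISO
# language codes like those returned by langdetect, so an Indonesian title
# detected as 'id' would be translated roughly like this (example is made up):
#
#   translate_text('Ekonomi tumbuh pesat tahun ini', source='id', target='en')
#   # -> something like 'The economy is growing rapidly this year'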

## ............................................... ##
# Function to preprocess the data
@st.cache_data
def preprocessing_data(df):
    # Remove duplicates
    df = df.drop_duplicates(subset='Translation')

    # Reset the index to add the date column
    df.reset_index(inplace=True,drop=True)

    # Function to clean and preprocess text
    def clean_text(text):
        # Remove URLs
        text = re.sub(r'http\S+', '', text)

        # Convert to lowercase
        text = text.lower()

        # Remove non-alphanumeric characters
        text = re.sub(r'[^a-zA-Z\s]', '', text)

        # Tokenize text
        words = nltk.word_tokenize(text)

        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

        # Lemmatize words
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

        return ' '.join(words)

    # Apply the clean_text function to the "Translation" column
    df['Cleaned Translation'] = df['Translation'].apply(clean_text)

    return df
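
# Illustrative sketch of what the nested clean_text helper does to one made-up
# string: URLs stripped, text lowercased, punctuation and digits removed,
# stopwords dropped, and the remaining words lemmatized:
#
#   clean_text('Breaking News!!! Markets are rising: http://example.com')
#   # -> roughly 'breaking news market rising'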
    
## ............................................... ##
# Function to create a Word Cloud
@st.cache_data
def create_wordcloud(df):
    # Combine all text
    text = ' '.join(df['Cleaned Translation'])

    # Create a Word Cloud
    wordcloud = WordCloud(width=700, height=400, max_words=80).generate(text)

    # Convert the word cloud to an image
    wordcloud_image = wordcloud.to_image()

    # Display the Word Cloud using st.image
    st.image(wordcloud_image, use_column_width=True)

## ............................................... ##
with st.container():
    # Input search parameters
    search_term = st.text_input('Enter a search term:', 'Indonesia')

    col1, col2, col3 = st.columns(3)

    with col1:
        period = st.text_input('Enter a news period:', '7d')
        max_results = st.number_input('Maximum number of results:', min_value=1, value=10)
    with col2:
        country = st.text_input('Country:', 'Indonesia')
        language = st.text_input('Language:', 'indonesian')
    with col3:
        start_date = st.date_input('Start Date:', pd.to_datetime('2023-01-01'))
        end_date = st.date_input('End Date:', pd.to_datetime('2023-12-01'))

## ............................................... ##
with st.container():
    col1, col2 = st.columns(2)

    with col1:
        # Checkbox options for different processing steps
        include_translation = st.checkbox("Include Translation", value=True)
        include_sentiment_analysis = st.checkbox("Include Sentiment Analysis", value=True)
    with col2:
        include_sentiment_vader = st.checkbox("Include VADER Sentiment Analysis", value=True)
        include_sentiment_textblob = st.checkbox("Include TextBlob Sentiment Analysis", value=True)

## ............................................... ##
# Create a variable to track whether the data has been processed
data_processed = False

## ............................................... ##
# Create a custom newspaper download configuration
config = Config()
config.number_threads = 500
config.request_timeout = 10

## ............................................... ##
# Initialize the DataFrame
df = pd.DataFrame(columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title', 'Authors', 'Keywords', 'Text', 'Summary']) 

# Load the cached model and tokenizer
model, tokenizer = get_models_and_tokenizers()

## ............................................... ##
with st.container():
    # Fetch news and process articles
    if st.button('Fetch and Process News'): 
        # Configure the GNews client from the user inputs
        google_news = GNews()
        google_news.period = period  # News period, e.g. '7d'
        google_news.max_results = max_results  # Maximum number of results per keyword
        google_news.country = country  # News from a specific country
        google_news.language = language  # News in a specific language
        #google_news.exclude_websites = ['yahoo.com', 'cnn.com']  # Exclude news from specific websites, e.g. Yahoo.com and CNN.com
        google_news.start_date = (start_date.year, start_date.month, start_date.day)  # Search window start (year, month, day)
        google_news.end_date = (end_date.year, end_date.month, end_date.day)  # Search window end (year, month, day)
        
        news = google_news.get_news(search_term)
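
        # Note: GNews.get_news returns a list of dicts; each item typically
        # includes 'title', 'url', 'published date', and 'publisher' keys.
        # Only the 'url' key is used in the loop below.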
        
        ## ............................................... ##
        # Progress bar for fetching and processing news
        progress_bar = st.progress(0)
        total_news = len(news)
        
        # Process each article returned by GNews
        for idx, x in enumerate(news):
            result = process_article(x['url'], _config=config)
            if result is not None:
                publish_date, language, url, source_url, title, authors, keywords, text, summary = result

                # Append the article to the DataFrame
                temp_df = pd.DataFrame({'Publish_Date': [publish_date], 'Language': [language], 'URL': [url], 'Source_Url': [source_url], 'Title': [title], 'Authors': [authors], 'Keywords': [keywords],
                                        'Text': [text], 'Summary': [summary]})
                df = pd.concat([df, temp_df], ignore_index=True)

            # Update the progress bar
            progress = (idx + 1) / total_news
            progress_bar.progress(progress)

        # Convert 'Publish_Date' to datetime once the loop has finished
        df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])
        
        # Conditionally create a 'Translation' column (Title + Summary translated to English)
        if include_translation:
            df['Translation'] = df.apply(lambda row: translate_text((row['Title'] + ' | ' + row['Summary']), source=row['Language'], target='en'), axis=1)
            
            # Preprocessing Data
            df = preprocessing_data(df)
        
        # Conditionally apply DistilBERT analysis to the 'Translation' column
        # (all sentiment steps require the translation step above to have run)
        if include_sentiment_analysis and 'Translation' in df.columns:
            df[['Fake Check', 'Sentiment Distilbert']] = df['Translation'].apply(lambda text: pd.Series(analyze_sentiment_distilbert(text, model, tokenizer)))

        # Conditionally apply VADER sentiment analysis to the 'Translation' column
        if include_sentiment_vader and 'Translation' in df.columns:
            df['Sentiment VADER'] = df['Translation'].apply(analyze_sentiment_vader)

        # Conditionally apply TextBlob sentiment analysis to the 'Translation' column
        if include_sentiment_textblob and 'Translation' in df.columns:
            df['Sentiment TextBlob'] = df['Translation'].apply(analyze_sentiment_textblob)
        
        # Set data_processed to True when the data has been successfully processed
        data_processed = True
        
    ## ............................................... ##
    # Add a button to download the data as a CSV file
    if data_processed:
        st.markdown("### Download Processed Data as CSV")
        st.write("Click the button below to download the processed data as a CSV file.")
        
        # Create a downloadable link
        csv_data = df.to_csv(index=False).encode()
        st.download_button(
            label="Download CSV",
            data=csv_data,
            file_name="processed_data.csv",
            mime="text/csv",
        )

    ## ............................................... ##
    with st.expander("See for Table"):
        # Display processed data
        if data_processed:
            AgGrid(df, height=400)
    
    ## ............................................... ##
    # Display processed data
    with st.expander("See for Exploratory Data Analysis"):
        if data_processed:
            col1, col2 = st.columns(2)
            with col1:
                ## ............................................... ##
                # Create a DataFrame to count the number of news articles by Fake Check
                FakeCheck_counts = df['Fake Check'].value_counts().reset_index()
                FakeCheck_counts.columns = ['Fake Check', 'News Count']
                fig = px.bar(FakeCheck_counts, x='Fake Check', y='News Count', text='News Count', title='Total News by Fake Check')
                st.plotly_chart(fig, use_container_width=True)
        
                ## ............................................... ##
                # Create wordcloud
                try:
                    st.write('WordCloud for News')
                    create_wordcloud(df)
                except Exception as e:
                    logging.error(f" Column Translation Not Available : {str(e)}")
        
                ## ............................................... ##

            with col2:
                ## ............................................... ##
                # Create a DataFrame to count the number of News by language
                language_counts = df['Language'].value_counts().reset_index()
                language_counts.columns = ['Language', 'News Count']
                fig = px.bar(language_counts, x='Language', y='News Count', text='News Count', title='Total News by Language')
                st.plotly_chart(fig, use_container_width=True)
                
                ## ............................................... ##
                # Group by Sentiment columns and get the count
                try:
                    sentiment_counts = df[['Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob']].apply(lambda x: x.value_counts()).T
                    sentiment_counts = sentiment_counts.reset_index()
                    sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
                    fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment')
                    st.plotly_chart(fig, use_container_width=True)
        
                except Exception as e:
                    logging.error(f" Column Sentiment Not Available : {str(e)}")
            
                ## ............................................... ##
        
    with st.expander("See for Analysis with ydata-profiling"):
        ## ............................................... ##
        # Display processed data
        if data_processed:
            pr = ProfileReport(df)
            st_profile_report(pr)