# Spaces: Bayhaqy / Classification-News-Analysis-and-Prediction (Running)
# File size: 15,854 Bytes

# News Information and data article 
from newspaper import Article, Config
from gnews import GNews

# Data Analysis and Profiling
import pandas as pd
from ydata_profiling import ProfileReport
from st_aggrid import AgGrid

# Streamlit for Building the Dashboard
import streamlit as st
from streamlit_pandas_profiling import st_profile_report

# Language Detection
from langdetect import detect

# NLP and Text Processing
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# URL Parsing
from urllib.parse import urlparse

# Data Visualization
import plotly.express as px
import matplotlib.pyplot as plt

# Word Cloud Generation
from wordcloud import WordCloud

# Other Libraries
import torch
import requests
import subprocess
import logging
import json
import re
import os

# NLTK Data Download
# 'punkt_tab' is fetched alongside 'punkt' because recent NLTK releases load
# the tokenizer tables used by nltk.word_tokenize from that package.
for _nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

## ............................................... ##
# Page-level settings: browser title, wide layout and tab icon
st.set_page_config(
    page_title='News Scrapping',
    layout='wide',
    page_icon=':newspaper:',
)

# Header container holding the app title and the author credit
header = st.container()
header.title('News Article Scrapping')
header.write("Created by Bayhaqy")

## ............................................... ##
# Send INFO-and-above log records to a file next to the app
logging.basicConfig(
    filename='news_processing.log',
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)

## ............................................... ##
# Load the sentiment model together with its tokenizer
@st.cache_resource
def get_models_and_tokenizers():
    """Load the DistilBERT SST-2 sequence classifier and its tokenizer.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        tuple: ``(model, tokenizer)``, both created with ``from_pretrained``
        from the same checkpoint name.
    """
    checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
    loaded_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    return loaded_model, loaded_tokenizer

# Function for sentiment analysis with the DistilBERT classifier
@st.cache_data
def analyze_sentiment_distilbert(text, _model, _tokenizer):
    """Classify *text* and derive a coarse trust status from the result.

    Cached with ``st.cache_data``, like the other per-text analysers in this
    file, because the return value is plain data rather than a shared
    resource. The underscore-prefixed arguments are left out of Streamlit's
    cache key, so entries are keyed on *text* only.

    Args:
        text: Text to classify.
        _model: Model called with the tokenizer output; its result must expose
            ``.logits`` and the model must expose ``config.id2label``.
        _tokenizer: Callable producing PyTorch tensors for *text*.

    Returns:
        tuple: ``(status, predict)``. ``status`` is 'Not trust', 'Not sure' or
        'Trust' depending on the class-1 probability (as a truncated integer
        percentage); ``predict`` is the label name of the highest logit.
        ``('N/A', 'N/A')`` is returned when any step raises.
    """
    try:
        tokens_info = _tokenizer(text, truncation=True, return_tensors="pt")
        with torch.no_grad():
            logits = _model(**tokens_info).logits

        predicted_class_id = logits.argmax().item()
        predict = _model.config.id2label[predicted_class_id]

        # Probability of class index 1 for the first (only) row, in percent.
        probabilities = torch.nn.functional.softmax(logits[0], dim=0)
        class_one_percent = int(probabilities[1] * 100)

        if class_one_percent > 70:
            status = 'Not trust'
        elif class_one_percent > 40:
            status = 'Not sure'
        else:
            status = 'Trust'
        return status, predict

    except Exception as e:
        # Best effort: one failing text must not abort the whole batch.
        logging.error(f"Sentiment analysis error: {str(e)}")
        return 'N/A', 'N/A'

# VADER-based sentiment label
@st.cache_data
def analyze_sentiment_vader(text):
    """Label *text* from VADER's compound polarity score.

    Returns:
        str: 'Positive' when the compound score is at least 0.05, 'Negative'
        when it is at most -0.05, otherwise 'Neutral'.
    """
    compound = SentimentIntensityAnalyzer().polarity_scores(text)['compound']

    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

# TextBlob-based sentiment label
@st.cache_data
def analyze_sentiment_textblob(text):
    """Label *text* from the sign of TextBlob's polarity.

    Returns:
        str: 'Positive' for polarity above zero, 'Negative' for polarity
        below zero, 'Neutral' when it is exactly zero.
    """
    score = TextBlob(text).sentiment.polarity

    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'

## ............................................... ##
# Function to process an article
@st.cache_data
def process_article(url, _config):
    """Download and parse one article and collect the fields used by the app.

    Args:
        url: Address of the article to fetch.
        _config: newspaper ``Config`` handed to ``Article``. The leading
            underscore keeps it out of Streamlit's cache key.

    Returns:
        tuple | None: ``(publish_date, language, url, source_url, title,
        authors, keywords, text, summary)``, where ``publish_date`` is
        formatted as ``'%Y-%m-%d %H:%M'`` and ``language`` is detected from
        the title. ``None`` when the article has no publish date, has five or
        fewer characters of text, or any step raises (the error is logged).
    """
    try:
        article = Article(url=url, config=_config)
        article.download()
        article.parse()

        # Skip articles without a publish date
        if article.publish_date is None:
            return None

        # Skip articles with (almost) no body text
        if len(article.text) <= 5:
            return None

        text = article.text

        # Prefer the canonical link, but keep the requested URL when the page
        # declares none, rather than leaving URL and Source_Url empty.
        url = article.canonical_link or url
        source_url = urlparse(url).netloc

        title = article.title
        authors = article.authors
        publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M')

        # nlp() is run before the summary is read
        article.nlp()
        keywords = article.meta_keywords
        summary = article.summary

        language = detect(title)

        return publish_date, language, url, source_url, title, authors, keywords, text, summary

    except Exception as e:
        # Best effort: a failing article is logged and skipped by the caller.
        logging.error(f"Article processing error: {str(e)}")
        return None

# Function for translation
@st.cache_data
def translate_text(text, source='auto', target='en'):
    """Translate *text* from *source* to *target* with ``GoogleTranslator``.

    Args:
        text: Text to translate.
        source: Source language code, ``'auto'`` by default.
        target: Target language code, ``'en'`` by default.

    Returns:
        The translated text. The input is returned unchanged when both
        language codes are equal or when translation raises (the error is
        logged).
    """
    # Nothing to do when source and target are the same language
    if source == target:
        return text

    try:
        return GoogleTranslator(source=source, target=target).translate(text)
    except Exception as e:
        logging.error(f"Translation error: {str(e)}")
        return text

## ............................................... ##
# Function to preprocess the data
@st.cache_data
def preprocessing_data(df):
    """Drop duplicate translations and add a cleaned-text column.

    Args:
        df: DataFrame containing a 'Translation' column.

    Returns:
        A new DataFrame without duplicate 'Translation' values, re-indexed
        from 0, with an extra 'Cleaned Translation' column holding the
        lower-cased, letters-only, stop-word-free, lemmatized text.
    """
    # Built once per call instead of once per row
    url_pattern = re.compile(r'http\S+')
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        """Return the normalized form of one 'Translation' cell."""
        # Non-string cells (e.g. a missing translation) become empty text
        if not isinstance(text, str):
            return ''

        # Remove URLs
        text = url_pattern.sub('', text)

        # Convert to lowercase
        text = text.lower()

        # Keep letters and whitespace only (digits and punctuation are removed)
        text = non_letter_pattern.sub('', text)

        # Tokenize, drop stop words, lemmatize the rest
        words = nltk.word_tokenize(text)
        return ' '.join(
            lemmatizer.lemmatize(word) for word in words if word not in stop_words
        )

    # Remove duplicates and renumber the rows
    df = df.drop_duplicates(subset='Translation').reset_index(drop=True)

    # Apply the clean_text function to the "Translation" column
    df['Cleaned Translation'] = df['Translation'].apply(clean_text)

    return df
    
## ............................................... ##
# Word cloud rendering
@st.cache_data
def create_wordcloud(df):
    """Draw a word cloud built from the 'Cleaned Translation' column.

    Args:
        df: DataFrame containing a 'Cleaned Translation' column of strings.

    The image is shown with ``st.image``; nothing is returned.
    """
    # One corpus string made of every cleaned row
    corpus = ' '.join(df['Cleaned Translation'])

    # 700x400 cloud limited to 80 words
    cloud = WordCloud(width=700, height=400, max_words=80).generate(corpus)

    # Convert to an image and display it
    st.image(cloud.to_image(), use_column_width=True)

## ............................................... ##
with st.container():
    # Search parameters entered by the user
    search_term = st.text_input('Enter a search term :', 'Indonesia')

    col1, col2, col3 = st.columns(3)

    # First column: period and result limit
    period = col1.text_input('Enter a news period :', '7d')
    max_results = col1.number_input('Maximum number of results :', min_value=1, value=10)

    # Second column: country and language
    country = col2.text_input('Country :', 'Indonesia')
    language = col2.text_input('Language :', 'indonesian')

    # Third column: date range
    start_date = col3.date_input('Start Date :', pd.to_datetime('2023-01-01'))
    end_date = col3.date_input('End Date :', pd.to_datetime('2023-12-01'))

## ............................................... ##
with st.container():
    col1, col2 = st.columns(2)

    # Left column: translation and DistilBERT analysis switches
    include_translation = col1.checkbox("Include Translation", value=True)
    include_sentiment_analysis = col1.checkbox("Include Sentiment Analysis", value=True)

    # Right column: VADER and TextBlob analysis switches
    include_sentiment_vader = col2.checkbox("Include VADER Sentiment Analysis", value=True)
    include_sentiment_textblob = col2.checkbox("Include TextBlob Sentiment Analysis", value=True)

## ............................................... ##
# Flag set to True once news has been fetched and processed in this run
data_processed = False

## ............................................... ##
# newspaper configuration passed to every article download
config = Config()
config.request_timeout = 10
config.number_threads = 500

## ............................................... ##
# Empty frame carrying the article columns
df = pd.DataFrame(
    columns=[
        'Publish_Date',
        'Language',
        'URL',
        'Source_Url',
        'Title',
        'Authors',
        'Keywords',
        'Text',
        'Summary',
    ]
)

# Sentiment model and tokenizer used by the DistilBERT analysis
model, tokenizer = get_models_and_tokenizers()

## ............................................... ##
with st.container():
    # Fetch news and process articles
    if st.button('Fetch and Process News'):
        # Build the Google News query from the user inputs
        google_news = GNews()
        google_news.period = period
        google_news.max_results = max_results
        google_news.country = country
        google_news.language = language
        google_news.start_date = (start_date.year, start_date.month, start_date.day)
        google_news.end_date = (end_date.year, end_date.month, end_date.day)

        # A missing result is treated the same as an empty one
        news = google_news.get_news(search_term) or []

        ## ............................................... ##
        # Progress bar for fetching and processing news
        progress_bar = st.progress(0)
        total_news = len(news)

        # Collect one tuple per usable article. The frame is built once after
        # the loop instead of being re-concatenated on every iteration, and
        # the loop no longer overwrites the user's `language` input.
        article_rows = []
        for idx, x in enumerate(news):
            result = process_article(x['url'], _config=config)
            if result is not None:
                article_rows.append(result)

            # Update the progress bar
            progress_bar.progress((idx + 1) / total_news)

        if not article_rows:
            # With no rows there is nothing to translate or score, so stop
            # here and leave data_processed False.
            st.warning('No articles could be processed for this search.')
        else:
            # Column order matches the tuple returned by process_article
            df = pd.DataFrame(
                article_rows,
                columns=['Publish_Date', 'Language', 'URL', 'Source_Url', 'Title',
                         'Authors', 'Keywords', 'Text', 'Summary'],
            )
            df['Publish_Date'] = pd.to_datetime(df['Publish_Date'])

            if include_translation:
                # Translate "title | summary" of every row to English
                df['Translation'] = [
                    translate_text(title + ' | ' + summary, source=detected, target='en')
                    for title, summary, detected in zip(df['Title'], df['Summary'], df['Language'])
                ]

                # Preprocessing Data (deduplicates rows and resets the index)
                df = preprocessing_data(df)
                analysis_text = df['Translation']
            else:
                # No 'Translation' column exists in this case, so the
                # sentiment steps score the original title and summary.
                analysis_text = df['Title'] + ' | ' + df['Summary']

            # Conditionally apply the DistilBERT analysis
            if include_sentiment_analysis:
                scores = [
                    analyze_sentiment_distilbert(text, model, tokenizer)
                    for text in analysis_text
                ]
                df['Fake Check'] = [status for status, _ in scores]
                df['Sentiment Distilbert'] = [label for _, label in scores]

            # Conditionally apply VADER sentiment analysis
            if include_sentiment_vader:
                df['Sentiment VADER'] = analysis_text.apply(analyze_sentiment_vader)

            # Conditionally apply TextBlob sentiment analysis
            if include_sentiment_textblob:
                df['Sentiment TextBlob'] = analysis_text.apply(analyze_sentiment_textblob)

            # Set data_processed to True when the data has been successfully processed
            data_processed = True
        
    ## ............................................... ##
    # Offer the processed frame as a CSV download
    if data_processed:
        st.markdown("### Download Processed Data as CSV")
        st.write("Click the button below to download the processed data as a CSV file.")

        # CSV text without the index column, encoded to bytes for the button
        st.download_button(
            label="Download CSV",
            data=df.to_csv(index=False).encode(),
            file_name="processed_data.csv",
        )

    ## ............................................... ##
    # Interactive grid of the processed rows
    table_expander = st.expander("See for Table")
    if data_processed:
        with table_expander:
            AgGrid(df, height=400)
    
    ## ............................................... ##
    # Display processed data
    with st.expander("See for Exploratory Data Analysis"):
        if data_processed:
            col1, col2 = st.columns(2)
            with col1:
                ## ............................................... ##
                # News count per 'Fake Check' status. The column is only
                # created when the DistilBERT analysis is enabled, so the
                # chart is skipped when it is absent.
                if 'Fake Check' in df.columns:
                    fake_check_counts = df['Fake Check'].value_counts().reset_index()
                    fake_check_counts.columns = ['Fake Check', 'News Count']
                    fig = px.bar(fake_check_counts, x='Fake Check', y='News Count', text='News Count', title='Total News by Fake Check')
                    st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

                ## ............................................... ##
                # Create wordcloud (best effort: failures are only logged)
                try:
                    st.write('WordCloud for News')
                    create_wordcloud(df)
                except Exception as e:
                    logging.error(f" Column Translation Not Available : {str(e)}")

                ## ............................................... ##

            with col2:
                ## ............................................... ##
                # Create a DataFrame to count the number of News by language
                language_counts = df['Language'].value_counts().reset_index()
                language_counts.columns = ['Language', 'News Count']
                fig = px.bar(language_counts, x='Language', y='News Count', text='News Count', title='Total News by Language')
                st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

                ## ............................................... ##
                # Count labels per sentiment column, using whichever of the
                # three analyses were actually run.
                sentiment_columns = [
                    name
                    for name in ('Sentiment Distilbert', 'Sentiment VADER', 'Sentiment TextBlob')
                    if name in df.columns
                ]
                if sentiment_columns:
                    try:
                        sentiment_counts = df[sentiment_columns].apply(lambda x: x.value_counts()).T
                        sentiment_counts = sentiment_counts.reset_index()
                        sentiment_counts = pd.melt(sentiment_counts, id_vars='index', var_name='Sentiment', value_name='Count')
                        fig = px.bar(sentiment_counts, x='Sentiment', y='Count', color='index', barmode='group', title='Total News per Sentiment')
                        st.plotly_chart(fig, use_container_width=True, use_container_height=True, width=700, height=400)

                    except Exception as e:
                        logging.error(f" Column Sentiment Not Available : {str(e)}")
            
                ## ............................................... ##
        
    with st.expander("See for Analysis with ydata-profiling"):
        ## ............................................... ##
        # Build the profiling report and embed it once data is available
        if data_processed:
            st_profile_report(ProfileReport(df))