Spaces:

awacke1
/

Transcript-EDA-NLTK

Sleeping

App Files Files Community

awacke1 committed on Aug 14, 2024

Commit

069bed5

verified ·

1 Parent(s): df130bb

Create app.py

Browse files

Files changed (1) hide show

app.py +109 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import streamlit as st
+import pandas as pd
+import os
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
+import nltk
+from nltk.corpus import stopwords
+from nltk import FreqDist
+import re
+import base64
+from graphviz import Digraph
+from io import BytesIO
+import networkx as nx
+import matplotlib.pyplot as plt
+# ... [Keep all the existing imports and configurations] ...
+def get_txt_files():
+    # Exclude specific files
+    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
+    # List all .txt files excluding the ones in excluded_files
+    txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
+    # Create a dataframe with file names and full paths
+    df = pd.DataFrame({
+        'File Name': txt_files,
+        'Full Path': [os.path.abspath(f) for f in txt_files]
+    })
+    return df
+# ... [Keep all the existing functions] ...
+# Main code for UI
+st.title("📺 Transcript Analysis 📊")
+# Display dataframe of .txt files
+txt_files_df = get_txt_files()
+st.write("Available .txt files:")
+st.dataframe(txt_files_df)
+# Allow user to select a file from the dataframe
+selected_file = st.selectbox("Select a file to process:", txt_files_df['File Name'])
+if st.button(f"Process {selected_file}"):
+    file_path = txt_files_df[txt_files_df['File Name'] == selected_file]['Full Path'].iloc[0]
+    with open(file_path, 'r', encoding="utf-8") as file:
+        file_text = file.read()
+    # Process the selected file
+    text_without_timestamps = remove_timestamps(file_text)
+    top_words = extract_high_information_words(text_without_timestamps, 10)
+    with st.expander("📊 Top 10 High Information Words"):
+        st.write(top_words)
+    with st.expander("📈 Relationship Graph"):
+        display_relationship_graph(top_words)
+    context_words = extract_context_words(text_without_timestamps, top_words)
+    with st.expander("🔗 Context Graph"):
+        display_context_graph(context_words)
+    with st.expander("📑 Context Table"):
+        display_context_table(context_words)
+    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
+    num_sentences = len(sentences)
+    st.write(f"Total Sentences: {num_sentences}")
+    num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
+    clustered_sentences = cluster_sentences(sentences, num_clusters)
+    col1, col2 = st.columns(2)
+    with col1:
+        st.subheader("Original Text")
+        original_text = "\n".join(sentences)
+        st.text_area("Original Sentences", value=original_text, height=400)
+    with col2:
+        st.subheader("Clustered Text")
+        clusters = ""
+        clustered_text = ""
+        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
+        for i, cluster in enumerate(clustered_sentences):
+            cluster_text = "\n".join(cluster)
+            high_info_words = ", ".join(cluster_high_info_words[i])
+            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
+            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+        st.text_area("Clusters", value=clusters, height=200)
+        st.text_area("Clustered Sentences", value=clustered_text, height=200)
+        # Verify that all sentences are accounted for in the clustered output
+        clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
+        if set(sentences) == set(clustered_sentences_flat):
+            st.write("✅ All sentences are accounted for in the clustered output.")
+        else:
+            st.write("❌ Some sentences are missing in the clustered output.")
+    plot_cluster_words(clustered_sentences)
+st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")