Spaces:

awacke1
/

Transcript-EDA-NLTK

Sleeping

App · Files · Community

awacke1 committed on Aug 14, 2024

Commit

1ebbc73

verified ·

1 Parent(s): 34e2c53

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -151

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import streamlit as st
-import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
@@ -32,56 +31,6 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 📈:Speech recognition 🎙️ and thematic extraction 🌐, audiovisual content to actionable insights 🔑.
 ''')
-# 🧠 Cluster sentences using K-means
-def cluster_sentences(sentences, num_clusters):
-    sentences = [s for s in sentences if len(s) > 10]
-    num_clusters = min(num_clusters, len(sentences))
-    vectorizer = TfidfVectorizer()
-    X = vectorizer.fit_transform(sentences)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
-    kmeans.fit(X)
-    clustered_sentences = [[] for _ in range(num_clusters)]
-    for i, label in enumerate(kmeans.labels_):
-        similarity = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
-        clustered_sentences[label].append((similarity, sentences[i]))
-    return [[s for _, s in sorted(cluster, reverse=True)] for cluster in clustered_sentences]
-# 📊 Create context graph
-def create_context_graph(context_words):
-    graph = Digraph()
-    for i, (before, high, after) in enumerate(context_words):
-        if before:
-            graph.node(f'before{i}', before, shape='box')
-            graph.edge(f'before{i}', f'high{i}', label=before)
-        graph.node(f'high{i}', high, shape='ellipse')
-        if after:
-            graph.node(f'after{i}', after, shape='diamond')
-            graph.edge(f'high{i}', f'after{i}', label=after)
-    return graph
-# 🔗 Create relationship graph
-def create_relationship_graph(words):
-    graph = Digraph()
-    for i, word in enumerate(words):
-        graph.node(str(i), word)
-        if i > 0:
-            graph.edge(str(i-1), str(i), label=word)
-    return graph
-# 📈 Display context graph
-def display_context_graph(context_words):
-    st.graphviz_chart(create_context_graph(context_words))
-# 📊 Display context table
-def display_context_table(context_words):
-    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
-    table += "\n".join(f"| {b if b else ''} | {h} | {a if a else ''} |" for b, h, a in context_words)
-    st.markdown(table)
-# 📈 Display relationship graph
-def display_relationship_graph(words):
-    st.graphviz_chart(create_relationship_graph(words))
 # 📥 Download NLTK data
 @st.cache_resource
 def download_nltk_data():
@@ -94,11 +43,11 @@ def download_nltk_data():
             nltk.download('stopwords')
     st.success('NLTK data is ready!')
-# 🔍 Extract context words
-def extract_context_words(text, high_information_words):
-    words = nltk.word_tokenize(text)
-    return [(words[i-1] if i > 0 else None, word, words[i+1] if i < len(words)-1 else None)
-            for i, word in enumerate(words) if word.lower() in high_information_words]
 # 📊 Extract high information words
 def extract_high_information_words(text, top_n=10):
@@ -111,25 +60,91 @@ def extract_high_information_words(text, top_n=10):
         st.error(f"Error in extract_high_information_words: {str(e)}")
         return []
-# 📁 Get text files
-def get_txt_files():
-    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
-    txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
-    return pd.DataFrame({'File Name': txt_files, 'Full Path': [os.path.abspath(f) for f in txt_files]})
-# 📊 Get high info words per cluster
-def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
-    return [extract_high_information_words(" ".join(cluster), num_words) for cluster in cluster_sentences]
 # 💾 Get text file download link
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
     b64 = base64.b64encode(text_to_download.encode()).decode()
     return f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
-# 📊 Perform EDA
-def perform_eda(file_name):
-    st.subheader(f"EDA for {file_name}")
-    process_file(os.path.abspath(file_name))
 # 📊 Plot cluster words
 def plot_cluster_words(cluster_sentences):
@@ -154,91 +169,72 @@ def plot_cluster_words(cluster_sentences):
         st.markdown(f"Number of Sentences: {len(cluster)}")
         st.markdown("---")
-# 📝 Process file
-def process_file(file_path):
-    try:
-        with open(file_path, 'r', encoding="utf-8") as file:
-            file_text = file.read()
-        text_without_timestamps = remove_timestamps(file_text)
-        top_words = extract_high_information_words(text_without_timestamps, 10)
-        with st.expander("📊 Top 10 High Information Words"):
-            st.write(top_words)
-        with st.expander("📈 Relationship Graph"):
-            display_relationship_graph(top_words) if top_words else st.warning("Unable to generate relationship graph.")
-        context_words = extract_context_words(text_without_timestamps, top_words)
-        with st.expander("🔗 Context Graph"):
-            display_context_graph(context_words) if context_words else st.warning("Unable to generate context graph.")
-        with st.expander("📑 Context Table"):
-            display_context_table(context_words) if context_words else st.warning("Unable to display context table.")
-        sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-        st.write(f"Total Sentences: {len(sentences)}")
-        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
-        clustered_sentences = cluster_sentences(sentences, num_clusters)
-        col1, col2 = st.columns(2)
-        with col1:
-            st.subheader("Original Text")
-            st.text_area("Original Sentences", value="\n".join(sentences), height=400)
-        with col2:
-            st.subheader("Clustered Text")
-            cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-            clusters = "\n".join(f"Cluster {i+1} (High Info Words: {', '.join(words)})"
-                                 for i, words in enumerate(cluster_high_info_words))
-            clustered_text = "\n\n".join(f"Cluster {i+1} (High Info Words: {', '.join(words)}):\n{cluster_text}"
-                                         for i, (words, cluster_text) in enumerate(zip(cluster_high_info_words,
-                                                                                       ["\n".join(cluster) for cluster in clustered_sentences])))
-            st.text_area("Clusters", value=clusters, height=200)
-            st.text_area("Clustered Sentences", value=clustered_text, height=200)
-            clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
-            st.write("✅ All sentences are accounted for in the clustered output." if set(sentences) == set(clustered_sentences_flat)
-                     else "❌ Some sentences are missing in the clustered output.")
-        plot_cluster_words(clustered_sentences)
-    except Exception as e:
-        st.error(f"Error processing file: {str(e)}")
-# 🕰️ Remove timestamps
-def remove_timestamps(text):
-    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
-# Main execution
-download_nltk_data()
-st.title("📺 Transcript Analysis 📊")
-txt_files_df = get_txt_files()
-st.write("Available .txt files:")
-st.dataframe(txt_files_df[['File Name']])
-st.write("Select a file to perform EDA:")
-cols = st.columns(len(txt_files_df))
-for i, (_, row) in enumerate(txt_files_df.iterrows()):
-    if cols[i].button(f":file_folder: {row['File Name']}"):
-        perform_eda(row['File Name'])
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.markdown(message["content"])
-if prompt := st.chat_input("Ask a question about the data"):
-    st.session_state.messages.append({"role": "user", "content": prompt})
-    with st.chat_message("user"):
-        st.markdown(prompt)
-    response = f"You asked: {prompt}\n\nThis is a placeholder response. In a real application, you would process the user's question and provide an answer based on the data and EDA results."
-    st.session_state.messages.append({"role": "assistant", "content": response})
-    with st.chat_message("assistant"):
-        st.markdown(response)
 st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")

 import streamlit as st
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
 3. 📺 **Transcript Analysis** 📈:Speech recognition 🎙️ and thematic extraction 🌐, audiovisual content to actionable insights 🔑.
 ''')
 # 📥 Download NLTK data
 @st.cache_resource
 def download_nltk_data():
             nltk.download('stopwords')
     st.success('NLTK data is ready!')
+download_nltk_data()
+# 🕰️ Remove timestamps
+def remove_timestamps(text):
+    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
 # 📊 Extract high information words
 def extract_high_information_words(text, top_n=10):
         st.error(f"Error in extract_high_information_words: {str(e)}")
         return []
+# 🔗 Create relationship graph
+def create_relationship_graph(words):
+    graph = Digraph()
+    for i, word in enumerate(words):
+        graph.node(str(i), word)
+        if i > 0:
+            graph.edge(str(i-1), str(i), label=word)
+    return graph
+# 📈 Display relationship graph
+def display_relationship_graph(words):
+    try:
+        graph = create_relationship_graph(words)
+        st.graphviz_chart(graph)
+    except Exception as e:
+        st.error(f"Error displaying relationship graph: {str(e)}")
+# 🔍 Extract context words
+def extract_context_words(text, high_information_words):
+    words = nltk.word_tokenize(text)
+    return [(words[i-1] if i > 0 else None, word, words[i+1] if i < len(words)-1 else None)
+            for i, word in enumerate(words) if word.lower() in high_information_words]
+# 📊 Create context graph
+def create_context_graph(context_words):
+    graph = Digraph()
+    for i, (before, high, after) in enumerate(context_words):
+        if before:
+            graph.node(f'before{i}', before, shape='box')
+            graph.edge(f'before{i}', f'high{i}', label=before)
+        graph.node(f'high{i}', high, shape='ellipse')
+        if after:
+            graph.node(f'after{i}', after, shape='diamond')
+            graph.edge(f'high{i}', f'after{i}', label=after)
+    return graph
+# 📈 Display context graph
+def display_context_graph(context_words):
+    try:
+        graph = create_context_graph(context_words)
+        st.graphviz_chart(graph)
+    except Exception as e:
+        st.error(f"Error displaying context graph: {str(e)}")
+# 📊 Display context table
+def display_context_table(context_words):
+    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
+    table += "\n".join(f"| {b if b else ''} | {h} | {a if a else ''} |" for b, h, a in context_words)
+    st.markdown(table)
+# 📁 Load example files
+def load_example_files():
+    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
+    example_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
+    if example_files:
+        selected_file = st.selectbox("📄 Select an example file:", example_files)
+        if st.button(f"📂 Load {selected_file}"):
+            with open(selected_file, 'r', encoding="utf-8") as file:
+                return file.read()
+    else:
+        st.write("No suitable example files found.")
+    return None
+# 🧠 Cluster sentences
+def cluster_sentences(sentences, num_clusters):
+    sentences = [s for s in sentences if len(s) > 10]
+    num_clusters = min(num_clusters, len(sentences))
+    vectorizer = TfidfVectorizer()
+    X = vectorizer.fit_transform(sentences)
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+    kmeans.fit(X)
+    clustered_sentences = [[] for _ in range(num_clusters)]
+    for i, label in enumerate(kmeans.labels_):
+        similarity = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
+        clustered_sentences[label].append((similarity, sentences[i]))
+    return [[s for _, s in sorted(cluster, reverse=True)] for cluster in clustered_sentences]
 # 💾 Get text file download link
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
     b64 = base64.b64encode(text_to_download.encode()).decode()
     return f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
+# 📊 Get high info words per cluster
+def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
+    return [extract_high_information_words(" ".join(cluster), num_words) for cluster in cluster_sentences]
 # 📊 Plot cluster words
 def plot_cluster_words(cluster_sentences):
         st.markdown(f"Number of Sentences: {len(cluster)}")
         st.markdown("---")
+# Main code for UI
+uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])
+example_text = load_example_files()
+if example_text:
+    file_text = example_text
+elif uploaded_file:
+    file_text = uploaded_file.read().decode("utf-8")
+else:
+    file_text = ""
+if file_text:
+    text_without_timestamps = remove_timestamps(file_text)
+    top_words = extract_high_information_words(text_without_timestamps, 10)
+    with st.expander("📊 Top 10 High Information Words"):
+        st.write(top_words)
+    with st.expander("📈 Relationship Graph"):
+        display_relationship_graph(top_words)
+    context_words = extract_context_words(text_without_timestamps, top_words)
+    with st.expander("🔗 Context Graph"):
+        display_context_graph(context_words)
+    with st.expander("📑 Context Table"):
+        display_context_table(context_words)
+    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
+    num_sentences = len(sentences)
+    st.write(f"Total Sentences: {num_sentences}")
+    num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
+    clustered_sentences = cluster_sentences(sentences, num_clusters)
+    col1, col2 = st.columns(2)
+    with col1:
+        st.subheader("Original Text")
+        original_text = "\n".join(sentences)
+        st.text_area("Original Sentences", value=original_text, height=400)
+    with col2:
+        st.subheader("Clustered Text")
+        clusters = ""
+        clustered_text = ""
+        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
+        for i, cluster in enumerate(clustered_sentences):
+            cluster_text = "\n".join(cluster)
+            high_info_words = ", ".join(cluster_high_info_words[i])
+            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
+            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+        st.text_area("Clusters", value=clusters, height=200)
+        st.text_area("Clustered Sentences", value=clustered_text, height=200)
+        clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
+        if set(sentences) == set(clustered_sentences_flat):
+            st.write("✅ All sentences are accounted for in the clustered output.")
+        else:
+            st.write("❌ Some sentences are missing in the clustered output.")
+    plot_cluster_words(clustered_sentences)
 st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")