Update app.py
app.py CHANGED
@@ -14,7 +14,6 @@ from io import BytesIO
 import networkx as nx
 import matplotlib.pyplot as plt
 
-# Set page configuration with a title and favicon
 st.set_page_config(
     page_title="📺 Transcript EDA NLTK",
     page_icon="🏠",
@@ -33,7 +32,6 @@ st.markdown('''
 3. 📺 **Transcript Analysis**: Speech recognition 🎙️ and thematic extraction, turning audiovisual content into actionable insights.
 ''')
 
-# Download NLTK resources
 nltk.download('punkt')
 nltk.download('stopwords')
 
@@ -53,7 +51,7 @@ def create_relationship_graph(words):
     for index, word in enumerate(words):
         graph.node(str(index), word)
         if index > 0:
-        graph.edge(str(index - 1), str(index), label=word)
+            graph.edge(str(index - 1), str(index), label=word)
     return graph
 
 def display_relationship_graph(words):
@@ -79,9 +77,9 @@ def create_context_graph(context_words):
         if after_word:
             graph.node(f'after{index}', after_word, shape='diamond')
         if before_word:
-        graph.edge(f'before{index}', f'high{index}', label=before_word)
+            graph.edge(f'before{index}', f'high{index}', label=before_word)
         if after_word:
-        graph.edge(f'high{index}', f'after{index}', label=after_word)
+            graph.edge(f'high{index}', f'after{index}', label=after_word)
     return graph
 
 def display_context_graph(context_words):
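Both graph builders in this file follow the same `graphviz` pattern: one node per word, keyed by index, with labeled edges between adjacent nodes. A minimal standalone sketch of that pattern (the word list is made up; requires the `graphviz` Python package plus the Graphviz binaries):

```python
# Sketch of the node/edge pattern used by create_relationship_graph.
# Hypothetical word list; not part of app.py.
from graphviz import Digraph

words = ["alpha", "beta", "gamma"]
graph = Digraph()
for index, word in enumerate(words):
    graph.node(str(index), word)  # node id is the index, label is the word
    if index > 0:
        graph.edge(str(index - 1), str(index), label=word)
print(graph.source)  # emits DOT text; graph.render() would produce an image
```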
@@ -95,51 +93,29 @@ def display_context_table(context_words):
     st.markdown(table)
 
 def get_txt_files():
-    # Exclude specific files
     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
-
-    # List all .txt files excluding the ones in excluded_files
     txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
-
-    # Create a dataframe with file names and full paths
     df = pd.DataFrame({
         'File Name': txt_files,
         'Full Path': [os.path.abspath(f) for f in txt_files]
     })
-
     return df
 
 def cluster_sentences(sentences, num_clusters):
-    # Filter sentences with length over 10 characters
     sentences = [sentence for sentence in sentences if len(sentence) > 10]
-
-    # Check if the number of sentences is less than the desired number of clusters
     if len(sentences) < num_clusters:
-        # If so, adjust the number of clusters to match the number of sentences
        num_clusters = len(sentences)
-
-    # Vectorize the sentences
     vectorizer = TfidfVectorizer()
     X = vectorizer.fit_transform(sentences)
-
-    # Perform k-means clustering
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     kmeans.fit(X)
-
-    # Calculate the centroid of each cluster
     cluster_centers = kmeans.cluster_centers_
-
-    # Group sentences by cluster and calculate similarity to centroid
     clustered_sentences = [[] for _ in range(num_clusters)]
     for i, label in enumerate(kmeans.labels_):
         similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
         clustered_sentences[label].append((similarity, sentences[i]))
-
-    # Order sentences within each cluster based on their similarity to the centroid
     for cluster in clustered_sentences:
-    cluster.sort(reverse=True)
-
-    # Return the ordered clustered sentences without similarity scores for display
+        cluster.sort(reverse=True)
     return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
 
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
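The `cluster_sentences` change above is whitespace-only, but the function's core idea — TF-IDF vectors, k-means, then ordering each cluster by similarity to its centroid — can be exercised standalone. A minimal sketch with toy sentences (hypothetical data; scikit-learn only):

```python
# Standalone sketch of the TF-IDF + KMeans scheme in cluster_sentences.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel

sentences = [  # toy data, not from the app
    "The transcript covers neural networks in depth.",
    "Neural networks are explained with examples.",
    "The speaker then switches to cooking pasta.",
    "Pasta recipes close out the episode.",
]
num_clusters = 2

X = TfidfVectorizer().fit_transform(sentences)
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(X)

# Group sentences by label; order each group by cosine similarity to its
# centroid (linear_kernel on L2-normalized TF-IDF rows is cosine similarity).
clusters = [[] for _ in range(num_clusters)]
for i, label in enumerate(kmeans.labels_):
    sim = linear_kernel(kmeans.cluster_centers_[label:label + 1], X[i:i + 1]).flatten()[0]
    clusters[label].append((sim, sentences[i]))
for cluster in clusters:
    cluster.sort(reverse=True)
print([[s for _, s in cluster] for cluster in clusters])
```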
@@ -164,23 +140,17 @@ def plot_cluster_words(cluster_sentences):
         words = re.findall(r'\b[a-z]{4,}\b', cluster_text)
         word_freq = FreqDist(words)
         top_words = [word for word, _ in word_freq.most_common(20)]
-
         vectorizer = TfidfVectorizer()
         X = vectorizer.fit_transform(top_words)
         word_vectors = X.toarray()
-
         similarity_matrix = cosine_similarity(word_vectors)
-
         G = nx.from_numpy_array(similarity_matrix)
         pos = nx.spring_layout(G, k=0.5)
-
         plt.figure(figsize=(8, 6))
-    nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')
+        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')
         plt.axis('off')
         plt.title(f"Cluster {i+1} Word Arrangement")
-
         st.pyplot(plt)
-
         st.markdown(f"**Cluster {i+1} Details:**")
         st.markdown(f"Top Words: {', '.join(top_words)}")
         st.markdown(f"Number of Sentences: {len(cluster)}")
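The drawing code above hinges on `nx.from_numpy_array`, which turns a similarity matrix into a weighted graph for `spring_layout`. A minimal sketch with a made-up matrix (not the app's data):

```python
# Sketch of the similarity-graph drawing in plot_cluster_words.
# Words and matrix values are placeholders, not app output.
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

top_words = ["model", "training", "dataset", "tokens"]
similarity_matrix = np.array([
    [1.0, 0.6, 0.3, 0.1],
    [0.6, 1.0, 0.4, 0.2],
    [0.3, 0.4, 1.0, 0.5],
    [0.1, 0.2, 0.5, 1.0],
])

G = nx.from_numpy_array(similarity_matrix)  # edge weights = matrix entries
pos = nx.spring_layout(G, k=0.5, seed=42)
plt.figure(figsize=(8, 6))
nx.draw_networkx(G, pos, node_size=500, font_size=12, with_labels=True,
                 labels={i: w for i, w in enumerate(top_words)},
                 node_color='skyblue', edge_color='gray')
plt.axis('off')
plt.show()
```

One caveat worth noting: in `plot_cluster_words` itself the matrix comes from TF-IDF-vectorizing single words as separate one-term documents, so off-diagonal similarities are typically zero and the plotted graph can end up nearly edgeless.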
@@ -189,94 +159,76 @@ def plot_cluster_words(cluster_sentences):
 def process_file(file_path):
     with open(file_path, 'r', encoding="utf-8") as file:
         file_text = file.read()
-
-    # Process the selected file
     text_without_timestamps = remove_timestamps(file_text)
     top_words = extract_high_information_words(text_without_timestamps, 10)
-
     with st.expander("Top 10 High Information Words"):
         st.write(top_words)
-
     with st.expander("Relationship Graph"):
         display_relationship_graph(top_words)
-
     context_words = extract_context_words(text_without_timestamps, top_words)
-
     with st.expander("Context Graph"):
         display_context_graph(context_words)
-
     with st.expander("Context Table"):
         display_context_table(context_words)
-
     sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-
     num_sentences = len(sentences)
     st.write(f"Total Sentences: {num_sentences}")
-
     num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
     clustered_sentences = cluster_sentences(sentences, num_clusters)
-
     col1, col2 = st.columns(2)
-
     with col1:
         st.subheader("Original Text")
         original_text = "\n".join(sentences)
         st.text_area("Original Sentences", value=original_text, height=400)
-
     with col2:
         st.subheader("Clustered Text")
         clusters = ""
         clustered_text = ""
         cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-
         for i, cluster in enumerate(clustered_sentences):
             cluster_text = "\n".join(cluster)
             high_info_words = ", ".join(cluster_high_info_words[i])
             clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
             clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
-
         st.text_area("Clusters", value=clusters, height=200)
         st.text_area("Clustered Sentences", value=clustered_text, height=200)
-
-    # Verify that all sentences are accounted for in the clustered output
     clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
     if set(sentences) == set(clustered_sentences_flat):
         st.write("✅ All sentences are accounted for in the clustered output.")
     else:
         st.write("❌ Some sentences are missing in the clustered output.")
-
     plot_cluster_words(clustered_sentences)
 
-def
-
-
-    st.session_state.selected_file = selected_rows[0]['File Name']
-else:
-    st.session_state.selected_file = None
+def perform_eda(file_name):
+    st.subheader(f"EDA for {file_name}")
+    process_file(os.path.abspath(file_name))
 
-# Main code for UI
 st.title("📺 Transcript Analysis")
 
-# Display dataframe of .txt files
 txt_files_df = get_txt_files()
 st.write("Available .txt files:")
-
-
-st.
-
-
-
-
-
-
-
-
-
-st.
-
-
-
-
-st.
+st.dataframe(txt_files_df[['File Name']])
+
+st.write("Select a file to perform EDA:")
+cols = st.columns(len(txt_files_df))
+for i, (_, row) in enumerate(txt_files_df.iterrows()):
+    if cols[i].button(f":file_folder: {row['File Name']}"):
+        perform_eda(row['File Name'])
+
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+if prompt := st.chat_input("Ask a question about the data"):
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    response = f"You asked: {prompt}\n\nThis is a placeholder response. In a real application, you would process the user's question and provide an answer based on the data and EDA results."
+    st.session_state.messages.append({"role": "assistant", "content": response})
+    with st.chat_message("assistant"):
+        st.markdown(response)
 
 st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
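The block this commit adds replaces the truncated `st.` fragments and the half-written `def` with a complete file picker and a placeholder chat loop. The chat portion follows the standard Streamlit pattern: persist turns in `st.session_state`, replay them on each rerun, then append the new exchange. A runnable sketch of just that pattern (file name and placeholder reply are illustrative):

```python
# chat_sketch.py — minimal version of the chat loop added in this commit.
# Run with: streamlit run chat_sketch.py
import streamlit as st

if "messages" not in st.session_state:
    st.session_state.messages = []        # chat history survives reruns

for message in st.session_state.messages:  # replay prior turns
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Ask a question about the data"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)
    response = f"You asked: {prompt}"      # placeholder; no real QA backend
    st.session_state.messages.append({"role": "assistant", "content": response})
    with st.chat_message("assistant"):
        st.markdown(response)
```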
|