Spaces:

awacke1
/

Transcript-EDA-NLTK

Sleeping

App Files Files Community

awacke1 committed on Aug 14, 2024

Commit

34e2c53

verified ·

1 Parent(s): 9e06c9d

Update app.py

Browse files

Files changed (1) hide show

app.py +111 -116

app.py CHANGED Viewed

@@ -32,134 +32,120 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 📈:Speech recognition 🎙️ and thematic extraction 🌐, audiovisual content to actionable insights 🔑.
 ''')
 @st.cache_resource
 def download_nltk_data():
     try:
         nltk.data.find('tokenizers/punkt')
         nltk.data.find('corpora/stopwords')
     except LookupError:
-        nltk.download('punkt')
-        nltk.download('stopwords')
-download_nltk_data()
-def remove_timestamps(text):
-    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
 def extract_high_information_words(text, top_n=10):
     try:
-        words = nltk.word_tokenize(text)
-        words = [word.lower() for word in words if word.isalpha()]
         stop_words = set(stopwords.words('english'))
         filtered_words = [word for word in words if word not in stop_words]
-        freq_dist = FreqDist(filtered_words)
-        return [word for word, _ in freq_dist.most_common(top_n)]
     except Exception as e:
         st.error(f"Error in extract_high_information_words: {str(e)}")
         return []
-def create_relationship_graph(words):
-    graph = Digraph()
-    for index, word in enumerate(words):
-        graph.node(str(index), word)
-        if index > 0:
-            graph.edge(str(index - 1), str(index), label=word)
-    return graph
-def display_relationship_graph(words):
-    graph = create_relationship_graph(words)
-    st.graphviz_chart(graph)
-def extract_context_words(text, high_information_words):
-    words = nltk.word_tokenize(text)
-    context_words = []
-    for index, word in enumerate(words):
-        if word.lower() in high_information_words:
-            before_word = words[index - 1] if index > 0 else None
-            after_word = words[index + 1] if index < len(words) - 1 else None
-            context_words.append((before_word, word, after_word))
-    return context_words
-def create_context_graph(context_words):
-    graph = Digraph()
-    for index, (before_word, high_info_word, after_word) in enumerate(context_words):
-        if before_word:
-            graph.node(f'before{index}', before_word, shape='box')
-        graph.node(f'high{index}', high_info_word, shape='ellipse')
-        if after_word:
-            graph.node(f'after{index}', after_word, shape='diamond')
-        if before_word:
-            graph.edge(f'before{index}', f'high{index}', label=before_word)
-        if after_word:
-            graph.edge(f'high{index}', f'after{index}', label=after_word)
-    return graph
-def display_context_graph(context_words):
-    graph = create_context_graph(context_words)
-    st.graphviz_chart(graph)
-def display_context_table(context_words):
-    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
-    for before, high, after in context_words:
-        table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
-    st.markdown(table)
 def get_txt_files():
     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
     txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
-    df = pd.DataFrame({
-        'File Name': txt_files,
-        'Full Path': [os.path.abspath(f) for f in txt_files]
-    })
-    return df
-def cluster_sentences(sentences, num_clusters):
-    sentences = [sentence for sentence in sentences if len(sentence) > 10]
-    if len(sentences) < num_clusters:
-        num_clusters = len(sentences)
-    vectorizer = TfidfVectorizer()
-    X = vectorizer.fit_transform(sentences)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
-    kmeans.fit(X)
-    cluster_centers = kmeans.cluster_centers_
-    clustered_sentences = [[] for _ in range(num_clusters)]
-    for i, label in enumerate(kmeans.labels_):
-        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
-        clustered_sentences[label].append((similarity, sentences[i]))
-    for cluster in clustered_sentences:
-        cluster.sort(reverse=True)
-    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
-    buffer = BytesIO()
-    buffer.write(text_to_download.encode())
-    buffer.seek(0)
-    b64 = base64.b64encode(buffer.read()).decode()
-    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
-    return href
-def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
-    cluster_high_info_words = []
-    for cluster in cluster_sentences:
-        cluster_text = " ".join(cluster)
-        high_info_words = extract_high_information_words(cluster_text, num_words)
-        cluster_high_info_words.append(high_info_words)
-    return cluster_high_info_words
 def plot_cluster_words(cluster_sentences):
     for i, cluster in enumerate(cluster_sentences):
-        cluster_text = " ".join(cluster)
-        words = re.findall(r'\b[a-z]{4,}\b', cluster_text)
         word_freq = FreqDist(words)
         top_words = [word for word, _ in word_freq.most_common(20)]
         vectorizer = TfidfVectorizer()
         X = vectorizer.fit_transform(top_words)
-        word_vectors = X.toarray()
-        similarity_matrix = cosine_similarity(word_vectors)
         G = nx.from_numpy_array(similarity_matrix)
         pos = nx.spring_layout(G, k=0.5)
         plt.figure(figsize=(8, 6))
-        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')
         plt.axis('off')
         plt.title(f"Cluster {i+1} Word Arrangement")
         st.pyplot(plt)
@@ -168,55 +154,64 @@ def plot_cluster_words(cluster_sentences):
         st.markdown(f"Number of Sentences: {len(cluster)}")
         st.markdown("---")
 def process_file(file_path):
     try:
         with open(file_path, 'r', encoding="utf-8") as file:
             file_text = file.read()
         text_without_timestamps = remove_timestamps(file_text)
         top_words = extract_high_information_words(text_without_timestamps, 10)
         with st.expander("📊 Top 10 High Information Words"):
             st.write(top_words)
         with st.expander("📈 Relationship Graph"):
-            display_relationship_graph(top_words)
         context_words = extract_context_words(text_without_timestamps, top_words)
         with st.expander("🔗 Context Graph"):
-            display_context_graph(context_words)
         with st.expander("📑 Context Table"):
-            display_context_table(context_words)
         sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-        num_sentences = len(sentences)
-        st.write(f"Total Sentences: {num_sentences}")
         num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
         clustered_sentences = cluster_sentences(sentences, num_clusters)
         col1, col2 = st.columns(2)
         with col1:
             st.subheader("Original Text")
-            original_text = "\n".join(sentences)
-            st.text_area("Original Sentences", value=original_text, height=400)
         with col2:
             st.subheader("Clustered Text")
-            clusters = ""
-            clustered_text = ""
             cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-            for i, cluster in enumerate(clustered_sentences):
-                cluster_text = "\n".join(cluster)
-                high_info_words = ", ".join(cluster_high_info_words[i])
-                clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
-                clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
             st.text_area("Clusters", value=clusters, height=200)
             st.text_area("Clustered Sentences", value=clustered_text, height=200)
             clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
-            if set(sentences) == set(clustered_sentences_flat):
-                st.write("✅ All sentences are accounted for in the clustered output.")
-            else:
-                st.write("❌ Some sentences are missing in the clustered output.")
         plot_cluster_words(clustered_sentences)
     except Exception as e:
         st.error(f"Error processing file: {str(e)}")
-def perform_eda(file_name):
-    st.subheader(f"EDA for {file_name}")
-    process_file(os.path.abspath(file_name))
 st.title("📺 Transcript Analysis 📊")

 3. 📺 **Transcript Analysis** 📈:Speech recognition 🎙️ and thematic extraction 🌐, audiovisual content to actionable insights 🔑.
 ''')
# 🧠 Cluster sentences using K-means
def cluster_sentences(sentences, num_clusters):
    """Group sentences into K-means clusters over their TF-IDF vectors.

    Sentences of 10 characters or fewer are dropped before clustering, and the
    cluster count is capped at the number of remaining sentences.

    Args:
        sentences: Iterable of sentence strings.
        num_clusters: Requested number of clusters.

    Returns:
        A list with one list of sentences per cluster. Each inner list is
        ordered by descending linear-kernel similarity between the sentence
        vector and its cluster centre. An empty list is returned when no
        sentence survives the length filter or when fewer than one cluster
        is requested.
    """
    sentences = [s for s in sentences if len(s) > 10]
    num_clusters = min(num_clusters, len(sentences))
    if num_clusters < 1:
        # Nothing to cluster: fitting the vectorizer / K-means on no data
        # (or with zero clusters) would only raise.
        return []
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)
    clustered_sentences = [[] for _ in range(num_clusters)]
    for i, label in enumerate(kmeans.labels_):
        # Slices keep both operands 2-D, as linear_kernel is called on matrices.
        similarity = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
        clustered_sentences[label].append((similarity, sentences[i]))
    # Sort each cluster by similarity (ties fall back to the sentence text),
    # then drop the score.
    return [[s for _, s in sorted(cluster, reverse=True)] for cluster in clustered_sentences]
# 📊 Create context graph
def create_context_graph(context_words):
    """Build a directed graph linking each high-information word to its neighbours.

    Args:
        context_words: Iterable of (before, word, after) triples; a falsy
            before/after entry means that neighbour is absent.

    Returns:
        A Digraph that holds, per triple, a box node for the preceding word,
        an ellipse node for the word itself and a diamond node for the
        following word, with edges before -> word -> after labelled with the
        neighbour's text.
    """
    dot = Digraph()
    for idx, triple in enumerate(context_words):
        prev_word, focus_word, next_word = triple
        # Node ids carry the triple index so repeated words stay distinct.
        prev_id, focus_id, next_id = f'before{idx}', f'high{idx}', f'after{idx}'
        if prev_word:
            dot.node(prev_id, prev_word, shape='box')
            dot.edge(prev_id, focus_id, label=prev_word)
        dot.node(focus_id, focus_word, shape='ellipse')
        if next_word:
            dot.node(next_id, next_word, shape='diamond')
            dot.edge(focus_id, next_id, label=next_word)
    return dot
# 🔗 Create relationship graph
def create_relationship_graph(words):
    """Build a directed chain graph over the given words.

    Args:
        words: Iterable of word strings, in the order they should be chained.

    Returns:
        A Digraph with one node per word (node id is the word's position as
        a string) and an edge from each word to the next, labelled with the
        text of the edge's target word.
    """
    dot = Digraph()
    previous_id = None
    for position, term in enumerate(words):
        node_id = str(position)
        dot.node(node_id, term)
        if previous_id is not None:
            dot.edge(previous_id, node_id, label=term)
        previous_id = node_id
    return dot
# 📈 Display context graph
def display_context_graph(context_words):
    """Render the context graph for the given (before, word, after) triples.

    Args:
        context_words: Triples passed straight to create_context_graph.
    """
    context_graph = create_context_graph(context_words)
    st.graphviz_chart(context_graph)
# 📊 Display context table
def display_context_table(context_words):
    """Render the (before, word, after) triples as a Markdown table.

    Args:
        context_words: Iterable of (before, word, after) triples; a falsy
            entry is shown as an empty cell.
    """
    def cell(token):
        # A literal "|" inside a cell would be read as a column separator
        # and shift every following column, so escape it.
        return token.replace("|", "\\|") if token else ''

    header = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
    rows = "\n".join(f"| {cell(b)} | {cell(h)} | {cell(a)} |" for b, h, a in context_words)
    st.markdown(header + rows)
# 📈 Display relationship graph
def display_relationship_graph(words):
    """Render the chained relationship graph for the given words.

    Args:
        words: Words passed straight to create_relationship_graph.
    """
    relationship_graph = create_relationship_graph(words)
    st.graphviz_chart(relationship_graph)
# 📥 Download NLTK data
@st.cache_resource
def download_nltk_data():
    """Make sure the NLTK resources used by this app are installed.

    Each resource is looked up individually and only the missing ones are
    downloaded. Success is reported only when every required download
    succeeded; otherwise an error naming the failed packages is shown.
    """
    # (lookup path, downloader package id, required?)
    # NOTE(review): recent NLTK releases load the tokenizer from "punkt_tab"
    # rather than "punkt"; it is treated as optional so that older releases,
    # which do not ship that package, are not reported as broken.
    resources = (
        ('tokenizers/punkt', 'punkt', True),
        ('corpora/stopwords', 'stopwords', True),
        ('tokenizers/punkt_tab', 'punkt_tab', False),
    )
    missing = []
    for path, package, required in resources:
        try:
            nltk.data.find(path)
        except LookupError:
            missing.append((package, required))

    failed = []
    if missing:
        with st.spinner('Downloading required NLTK data...'):
            for package, required in missing:
                # nltk.download returns False instead of raising on failure.
                downloaded = nltk.download(package, quiet=not required)
                if required and not downloaded:
                    failed.append(package)

    if failed:
        st.error(f"Failed to download NLTK data: {', '.join(failed)}")
    else:
        st.success('NLTK data is ready!')
# 🔍 Extract context words
def extract_context_words(text, high_information_words):
    """Find every occurrence of a high-information word with its neighbours.

    Args:
        text: Text to tokenize with nltk.word_tokenize.
        high_information_words: Collection of lower-case words to look for;
            tokens are lower-cased before the comparison.

    Returns:
        A list of (before, word, after) tuples in order of appearance, where
        ``word`` is the token as it appears in the text and ``before`` /
        ``after`` are the adjacent tokens, or None at either end of the text.
    """
    if not high_information_words:
        # No targets means no matches; skip tokenizing altogether.
        return []
    targets = set(high_information_words)  # O(1) membership per token
    words = nltk.word_tokenize(text)
    last_index = len(words) - 1
    return [(words[i-1] if i > 0 else None, word, words[i+1] if i < last_index else None)
            for i, word in enumerate(words) if word.lower() in targets]
# 📊 Extract high information words
def extract_high_information_words(text, top_n=10):
    """Return the most frequent alphabetic, non-stop-word tokens of a text.

    Args:
        text: Text to tokenize with nltk.word_tokenize.
        top_n: Maximum number of words to return.

    Returns:
        Up to ``top_n`` lower-cased words, most frequent first. If anything
        goes wrong the error is shown via st.error and an empty list is
        returned.
    """
    try:
        tokens = nltk.word_tokenize(text)
        lowered_words = [token.lower() for token in tokens if token.isalpha()]
        english_stop_words = set(stopwords.words('english'))
        candidates = []
        for candidate in lowered_words:
            if candidate in english_stop_words:
                continue
            candidates.append(candidate)
        ranked = FreqDist(candidates).most_common(top_n)
        return [term for term, _count in ranked]
    except Exception as e:
        st.error(f"Error in extract_high_information_words: {str(e)}")
        return []
# 📁 Get text files
def get_txt_files():
    """List the candidate transcript files in the current working directory.

    Returns:
        A DataFrame with a 'File Name' column holding every ``.txt`` entry of
        the current directory except the known dependency/freeze files, and a
        'Full Path' column with the matching absolute paths. Rows are sorted
        by file name.
    """
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
    # os.listdir() returns entries in arbitrary order; sort so the listing is
    # stable between runs and platforms.
    txt_files = sorted(f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files)
    return pd.DataFrame({'File Name': txt_files, 'Full Path': [os.path.abspath(f) for f in txt_files]})
# 📊 Get high info words per cluster
def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
    """Compute the high-information words of every cluster.

    Args:
        cluster_sentences: Iterable of clusters, each an iterable of
            sentence strings.
        num_words: Maximum number of words to keep per cluster.

    Returns:
        A list with one entry per cluster: the result of
        extract_high_information_words on the cluster's sentences joined
        with single spaces.
    """
    words_per_cluster = []
    for group in cluster_sentences:
        joined_text = " ".join(group)
        words_per_cluster.append(extract_high_information_words(joined_text, num_words))
    return words_per_cluster
# 💾 Get text file download link
def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
    """Build an HTML anchor that downloads the given text as a file.

    Args:
        text_to_download: Text to embed in the link.
        filename: Suggested name for the downloaded file.
        button_label: Content of the anchor; inserted as-is, so it may
            contain markup.

    Returns:
        An ``<a>`` element string whose href is a base64 ``data:`` URL holding
        the UTF-8 encoded text.
    """
    import html

    b64 = base64.b64encode(text_to_download.encode("utf-8")).decode("ascii")
    # Escape the filename so a quote in it cannot terminate the attribute.
    safe_filename = html.escape(filename, quote=True)
    # text/plain with an explicit charset matches the UTF-8 payload.
    return (f'<a href="data:text/plain;charset=utf-8;base64,{b64}" download="{safe_filename}" '
            f'style="margin-top:20px;">{button_label}</a>')
# 📊 Perform EDA
def perform_eda(file_name):
    """Show a heading for the file and run the full analysis on it.

    Args:
        file_name: Path of the transcript file; resolved to an absolute path
            before being handed to process_file.
    """
    heading = f"EDA for {file_name}"
    st.subheader(heading)
    absolute_path = os.path.abspath(file_name)
    process_file(absolute_path)
+# 📊 Plot cluster words
 def plot_cluster_words(cluster_sentences):
     for i, cluster in enumerate(cluster_sentences):
+        words = re.findall(r'\b[a-z]{4,}\b', " ".join(cluster))
         word_freq = FreqDist(words)
         top_words = [word for word, _ in word_freq.most_common(20)]
         vectorizer = TfidfVectorizer()
         X = vectorizer.fit_transform(top_words)
+        similarity_matrix = cosine_similarity(X.toarray())
         G = nx.from_numpy_array(similarity_matrix)
         pos = nx.spring_layout(G, k=0.5)
         plt.figure(figsize=(8, 6))
+        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True,
+                         labels={i: word for i, word in enumerate(top_words)},
+                         node_color='skyblue', edge_color='gray')
         plt.axis('off')
         plt.title(f"Cluster {i+1} Word Arrangement")
         st.pyplot(plt)
         st.markdown(f"Number of Sentences: {len(cluster)}")
         st.markdown("---")
# 📝 Process file
def process_file(file_path):
    """Run the transcript analysis for one text file and render it in Streamlit.

    The file is read as UTF-8. High-information words and their context are
    computed on the text with timestamps removed, while clustering works on
    the stripped lines of the raw file that are longer than 10 characters.
    Any exception is reported through st.error instead of propagating.

    Args:
        file_path: Path of the text file to analyse.
    """
    try:
        with open(file_path, 'r', encoding="utf-8") as file:
            file_text = file.read()
        text_without_timestamps = remove_timestamps(file_text)
        top_words = extract_high_information_words(text_without_timestamps, 10)
        with st.expander("📊 Top 10 High Information Words"):
            st.write(top_words)
        with st.expander("📈 Relationship Graph"):
            if top_words:
                display_relationship_graph(top_words)
            else:
                st.warning("Unable to generate relationship graph.")
        context_words = extract_context_words(text_without_timestamps, top_words)
        with st.expander("🔗 Context Graph"):
            if context_words:
                display_context_graph(context_words)
            else:
                st.warning("Unable to generate context graph.")
        with st.expander("📑 Context Table"):
            if context_words:
                display_context_table(context_words)
            else:
                st.warning("Unable to display context table.")
        stripped_lines = (line.strip() for line in file_text.split('\n'))
        sentences = [line for line in stripped_lines if len(line) > 10]
        st.write(f"Total Sentences: {len(sentences)}")
        if not sentences:
            # Clustering needs at least one sentence; stop before the
            # vectorizer is asked to fit on nothing.
            st.warning("No sentences long enough to cluster.")
            return
        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
        clustered_sentences = cluster_sentences(sentences, num_clusters)
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Original Text")
            st.text_area("Original Sentences", value="\n".join(sentences), height=400)
        with col2:
            st.subheader("Clustered Text")
            cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
            # One heading per cluster, shared by the summary and the full listing.
            headings = [f"Cluster {i+1} (High Info Words: {', '.join(words)})"
                        for i, words in enumerate(cluster_high_info_words)]
            clusters = "\n".join(headings)
            clustered_text = "\n\n".join(f"{heading}:\n" + "\n".join(cluster)
                                         for heading, cluster in zip(headings, clustered_sentences))
            st.text_area("Clusters", value=clusters, height=200)
            st.text_area("Clustered Sentences", value=clustered_text, height=200)
            clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
            # Sanity check that clustering neither dropped nor invented sentences.
            if set(sentences) == set(clustered_sentences_flat):
                st.write("✅ All sentences are accounted for in the clustered output.")
            else:
                st.write("❌ Some sentences are missing in the clustered output.")
        plot_cluster_words(clustered_sentences)
    except Exception as e:
        st.error(f"Error processing file: {str(e)}")
# 🕰️ Remove timestamps
def remove_timestamps(text):
    """Strip timestamp entries from a transcript.

    A timestamp is ``M:SS`` / ``MM:SS``, optionally preceded by an hours
    field (``H:MM:SS``), followed by a newline. Without the hours field the
    pattern would match only the ``MM:SS`` tail and leave a dangling ``H:``.

    NOTE(review): the pattern also consumes the entire line that follows the
    timestamp, not just the timestamp itself. That is kept as-is here;
    confirm against the transcript format that this line is not spoken text.

    Args:
        text: Raw transcript text.

    Returns:
        The text with every timestamp line and the line after it removed.
    """
    return re.sub(r'(?:\d{1,2}:)?\d{1,2}:\d{2}\n.*\n', '', text)
+# Main execution
+download_nltk_data()
 st.title("📺 Transcript Analysis 📊")