awacke1 committed
Commit 9e06c9d · verified · 1 Parent(s): 96afb92

Update app.py
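Caches the NLTK 'punkt' and 'stopwords' downloads behind a @st.cache_resource helper so they are fetched at most once per server process, and wraps extract_high_information_words and process_file in try/except blocks that surface failures via st.error.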

Files changed (1)
app.py +64 -49
app.py CHANGED
@@ -32,19 +32,31 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 📈: Speech recognition 🎙️ and thematic extraction 🌐, audiovisual content to actionable insights 🔑.
 ''')
 
-nltk.download('punkt')
-nltk.download('stopwords')
+@st.cache_resource
+def download_nltk_data():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        nltk.download('punkt')
+        nltk.download('stopwords')
+
+download_nltk_data()
 
 def remove_timestamps(text):
     return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
 
 def extract_high_information_words(text, top_n=10):
-    words = nltk.word_tokenize(text)
-    words = [word.lower() for word in words if word.isalpha()]
-    stop_words = set(stopwords.words('english'))
-    filtered_words = [word for word in words if word not in stop_words]
-    freq_dist = FreqDist(filtered_words)
-    return [word for word, _ in freq_dist.most_common(top_n)]
+    try:
+        words = nltk.word_tokenize(text)
+        words = [word.lower() for word in words if word.isalpha()]
+        stop_words = set(stopwords.words('english'))
+        filtered_words = [word for word in words if word not in stop_words]
+        freq_dist = FreqDist(filtered_words)
+        return [word for word, _ in freq_dist.most_common(top_n)]
+    except Exception as e:
+        st.error(f"Error in extract_high_information_words: {str(e)}")
+        return []
 
 def create_relationship_graph(words):
     graph = Digraph()
@@ -157,47 +169,50 @@ def plot_cluster_words(cluster_sentences):
     st.markdown("---")
 
 def process_file(file_path):
-    with open(file_path, 'r', encoding="utf-8") as file:
-        file_text = file.read()
-    text_without_timestamps = remove_timestamps(file_text)
-    top_words = extract_high_information_words(text_without_timestamps, 10)
-    with st.expander("📊 Top 10 High Information Words"):
-        st.write(top_words)
-    with st.expander("📈 Relationship Graph"):
-        display_relationship_graph(top_words)
-    context_words = extract_context_words(text_without_timestamps, top_words)
-    with st.expander("🔗 Context Graph"):
-        display_context_graph(context_words)
-    with st.expander("📑 Context Table"):
-        display_context_table(context_words)
-    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-    num_sentences = len(sentences)
-    st.write(f"Total Sentences: {num_sentences}")
-    num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
-    clustered_sentences = cluster_sentences(sentences, num_clusters)
-    col1, col2 = st.columns(2)
-    with col1:
-        st.subheader("Original Text")
-        original_text = "\n".join(sentences)
-        st.text_area("Original Sentences", value=original_text, height=400)
-    with col2:
-        st.subheader("Clustered Text")
-        clusters = ""
-        clustered_text = ""
-        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-        for i, cluster in enumerate(clustered_sentences):
-            cluster_text = "\n".join(cluster)
-            high_info_words = ", ".join(cluster_high_info_words[i])
-            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
-            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
-        st.text_area("Clusters", value=clusters, height=200)
-        st.text_area("Clustered Sentences", value=clustered_text, height=200)
-    clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
-    if set(sentences) == set(clustered_sentences_flat):
-        st.write("✅ All sentences are accounted for in the clustered output.")
-    else:
-        st.write("❌ Some sentences are missing in the clustered output.")
-    plot_cluster_words(clustered_sentences)
+    try:
+        with open(file_path, 'r', encoding="utf-8") as file:
+            file_text = file.read()
+        text_without_timestamps = remove_timestamps(file_text)
+        top_words = extract_high_information_words(text_without_timestamps, 10)
+        with st.expander("📊 Top 10 High Information Words"):
+            st.write(top_words)
+        with st.expander("📈 Relationship Graph"):
+            display_relationship_graph(top_words)
+        context_words = extract_context_words(text_without_timestamps, top_words)
+        with st.expander("🔗 Context Graph"):
+            display_context_graph(context_words)
+        with st.expander("📑 Context Table"):
+            display_context_table(context_words)
+        sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
+        num_sentences = len(sentences)
+        st.write(f"Total Sentences: {num_sentences}")
+        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
+        clustered_sentences = cluster_sentences(sentences, num_clusters)
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("Original Text")
+            original_text = "\n".join(sentences)
+            st.text_area("Original Sentences", value=original_text, height=400)
+        with col2:
+            st.subheader("Clustered Text")
+            clusters = ""
+            clustered_text = ""
+            cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
+            for i, cluster in enumerate(clustered_sentences):
+                cluster_text = "\n".join(cluster)
+                high_info_words = ", ".join(cluster_high_info_words[i])
+                clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
+                clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+            st.text_area("Clusters", value=clusters, height=200)
+            st.text_area("Clustered Sentences", value=clustered_text, height=200)
+        clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
+        if set(sentences) == set(clustered_sentences_flat):
+            st.write("✅ All sentences are accounted for in the clustered output.")
+        else:
+            st.write("❌ Some sentences are missing in the clustered output.")
+        plot_cluster_words(clustered_sentences)
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
 
 def perform_eda(file_name):
     st.subheader(f"EDA for {file_name}")
 
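remove_timestamps is unchanged by this commit but performs the first transformation in process_file: the regex deletes each M:SS or MM:SS timestamp line together with the single line that follows it. A standalone check with made-up transcript text:

import re

def remove_timestamps(text):
    # Drop a timestamp line plus the one line immediately after it.
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)

transcript = "Intro by the host.\n12:05\nchapter marker\nMain discussion continues.\n"
print(remove_timestamps(transcript))
# Intro by the host.
# Main discussion continues.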
 