awacke1 committed
Commit 9e06c9d · verified · 1 Parent(s): 96afb92

Update app.py
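Caches the NLTK 'punkt' and 'stopwords' downloads behind a @st.cache_resource helper so they are fetched at most once per server process, and wraps extract_high_information_words and process_file in try/except blocks that surface failures via st.error.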

Files changed (1)
app.py +64 -49
app.py CHANGED
@@ -32,19 +32,31 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 📈: Speech recognition 🎙️ and thematic extraction 🌐, audiovisual content to actionable insights 🔑.
 ''')
 
-nltk.download('punkt')
-nltk.download('stopwords')
+@st.cache_resource
+def download_nltk_data():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        nltk.download('punkt')
+        nltk.download('stopwords')
+
+download_nltk_data()
 
 def remove_timestamps(text):
     return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
 
 def extract_high_information_words(text, top_n=10):
-    words = nltk.word_tokenize(text)
-    words = [word.lower() for word in words if word.isalpha()]
-    stop_words = set(stopwords.words('english'))
-    filtered_words = [word for word in words if word not in stop_words]
-    freq_dist = FreqDist(filtered_words)
-    return [word for word, _ in freq_dist.most_common(top_n)]
+    try:
+        words = nltk.word_tokenize(text)
+        words = [word.lower() for word in words if word.isalpha()]
+        stop_words = set(stopwords.words('english'))
+        filtered_words = [word for word in words if word not in stop_words]
+        freq_dist = FreqDist(filtered_words)
+        return [word for word, _ in freq_dist.most_common(top_n)]
+    except Exception as e:
+        st.error(f"Error in extract_high_information_words: {str(e)}")
+        return []
 
 def create_relationship_graph(words):
     graph = Digraph()
@@ -157,47 +169,50 @@ def plot_cluster_words(cluster_sentences):
     st.markdown("---")
 
 def process_file(file_path):
-    with open(file_path, 'r', encoding="utf-8") as file:
-        file_text = file.read()
-    text_without_timestamps = remove_timestamps(file_text)
-    top_words = extract_high_information_words(text_without_timestamps, 10)
-    with st.expander("📊 Top 10 High Information Words"):
-        st.write(top_words)
-    with st.expander("📈 Relationship Graph"):
-        display_relationship_graph(top_words)
-    context_words = extract_context_words(text_without_timestamps, top_words)
-    with st.expander("🔗 Context Graph"):
-        display_context_graph(context_words)
-    with st.expander("📑 Context Table"):
-        display_context_table(context_words)
-    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-    num_sentences = len(sentences)
-    st.write(f"Total Sentences: {num_sentences}")
-    num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
-    clustered_sentences = cluster_sentences(sentences, num_clusters)
-    col1, col2 = st.columns(2)
-    with col1:
-        st.subheader("Original Text")
-        original_text = "\n".join(sentences)
-        st.text_area("Original Sentences", value=original_text, height=400)
-    with col2:
-        st.subheader("Clustered Text")
-        clusters = ""
-        clustered_text = ""
-        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-        for i, cluster in enumerate(clustered_sentences):
-            cluster_text = "\n".join(cluster)
-            high_info_words = ", ".join(cluster_high_info_words[i])
-            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
-            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
-        st.text_area("Clusters", value=clusters, height=200)
-        st.text_area("Clustered Sentences", value=clustered_text, height=200)
-    clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
-    if set(sentences) == set(clustered_sentences_flat):
-        st.write("✅ All sentences are accounted for in the clustered output.")
-    else:
-        st.write("❌ Some sentences are missing in the clustered output.")
-    plot_cluster_words(clustered_sentences)
+    try:
+        with open(file_path, 'r', encoding="utf-8") as file:
+            file_text = file.read()
+        text_without_timestamps = remove_timestamps(file_text)
+        top_words = extract_high_information_words(text_without_timestamps, 10)
+        with st.expander("📊 Top 10 High Information Words"):
+            st.write(top_words)
+        with st.expander("📈 Relationship Graph"):
+            display_relationship_graph(top_words)
+        context_words = extract_context_words(text_without_timestamps, top_words)
+        with st.expander("🔗 Context Graph"):
+            display_context_graph(context_words)
+        with st.expander("📑 Context Table"):
+            display_context_table(context_words)
+        sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
+        num_sentences = len(sentences)
+        st.write(f"Total Sentences: {num_sentences}")
+        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
+        clustered_sentences = cluster_sentences(sentences, num_clusters)
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("Original Text")
+            original_text = "\n".join(sentences)
+            st.text_area("Original Sentences", value=original_text, height=400)
+        with col2:
+            st.subheader("Clustered Text")
+            clusters = ""
+            clustered_text = ""
+            cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
+            for i, cluster in enumerate(clustered_sentences):
+                cluster_text = "\n".join(cluster)
+                high_info_words = ", ".join(cluster_high_info_words[i])
+                clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
+                clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+            st.text_area("Clusters", value=clusters, height=200)
+            st.text_area("Clustered Sentences", value=clustered_text, height=200)
+        clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
+        if set(sentences) == set(clustered_sentences_flat):
+            st.write("✅ All sentences are accounted for in the clustered output.")
+        else:
+            st.write("❌ Some sentences are missing in the clustered output.")
+        plot_cluster_words(clustered_sentences)
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
 
 def perform_eda(file_name):
     st.subheader(f"EDA for {file_name}")
 
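remove_timestamps is unchanged by this commit but performs the first transformation in process_file: the regex deletes each M:SS or MM:SS timestamp line together with the single line that follows it. A standalone check with made-up transcript text:

import re

def remove_timestamps(text):
    # Drop a timestamp line plus the one line immediately after it.
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)

transcript = "Intro by the host.\n12:05\nchapter marker\nMain discussion continues.\n"
print(remove_timestamps(transcript))
# Intro by the host.
# Main discussion continues.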
 