awacke1 committed
Commit 96afb92 · verified · 1 Parent(s): 868ef7a

Update app.py

Replaces the st.data_editor file selector and its on_file_select callback with one-click per-file buttons backed by a new perform_eda helper, adds a placeholder chat interface, and strips redundant inline comments.

Files changed (1):
  1. app.py +31 -79
app.py CHANGED
@@ -14,7 +14,6 @@ from io import BytesIO
 import networkx as nx
 import matplotlib.pyplot as plt
 
-# Set page configuration with a title and favicon
 st.set_page_config(
     page_title="📺Transcript📜EDA🔍NLTK",
     page_icon="🌠",
@@ -33,7 +32,6 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 📈:Speech recognition 🎙️ and thematic extraction 🌐, audiovisual content to actionable insights 🔑.
 ''')
 
-# Download NLTK resources
 nltk.download('punkt')
 nltk.download('stopwords')
 
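Side note on the two `nltk.download` calls this hunk leaves in place: they run on every app start. They are no-ops once the corpora are cached, but a quiet variant keeps startup logs clean; a minimal sketch, not part of this commit:

```python
import nltk

# quiet=True suppresses download progress output; both calls are
# no-ops once 'punkt' and 'stopwords' are already cached locally
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
```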
@@ -53,7 +51,7 @@ def create_relationship_graph(words):
     for index, word in enumerate(words):
         graph.node(str(index), word)
         if index > 0:
-            graph.edge(str(index - 1), str(index), label=word)  # Add word as edge label
+            graph.edge(str(index - 1), str(index), label=word)
     return graph
 
 def display_relationship_graph(words):
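For context, `create_relationship_graph` builds a simple word chain with Graphviz; a standalone sketch of the same pattern, assuming `graphviz` is the library imported in app.py:

```python
import graphviz

def chain_words(words):
    # One node per word; each node linked to its predecessor,
    # mirroring create_relationship_graph above
    graph = graphviz.Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=word)
    return graph

print(chain_words(["topic", "model", "graph"]).source)  # emits DOT text
```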
@@ -79,9 +77,9 @@ def create_context_graph(context_words):
         if after_word:
             graph.node(f'after{index}', after_word, shape='diamond')
         if before_word:
-            graph.edge(f'before{index}', f'high{index}', label=before_word)  # Add before_word as edge label
+            graph.edge(f'before{index}', f'high{index}', label=before_word)
         if after_word:
-            graph.edge(f'high{index}', f'after{index}', label=after_word)  # Add after_word as edge label
+            graph.edge(f'high{index}', f'after{index}', label=after_word)
     return graph
 
 def display_context_graph(context_words):
@@ -95,51 +93,29 @@ def display_context_table(context_words):
     st.markdown(table)
 
 def get_txt_files():
-    # Exclude specific files
     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
-
-    # List all .txt files excluding the ones in excluded_files
     txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
-
-    # Create a dataframe with file names and full paths
     df = pd.DataFrame({
         'File Name': txt_files,
         'Full Path': [os.path.abspath(f) for f in txt_files]
     })
-
     return df
 
 def cluster_sentences(sentences, num_clusters):
-    # Filter sentences with length over 10 characters
     sentences = [sentence for sentence in sentences if len(sentence) > 10]
-
-    # Check if the number of sentences is less than the desired number of clusters
     if len(sentences) < num_clusters:
-        # If so, adjust the number of clusters to match the number of sentences
         num_clusters = len(sentences)
-
-    # Vectorize the sentences
     vectorizer = TfidfVectorizer()
     X = vectorizer.fit_transform(sentences)
-
-    # Perform k-means clustering
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     kmeans.fit(X)
-
-    # Calculate the centroid of each cluster
     cluster_centers = kmeans.cluster_centers_
-
-    # Group sentences by cluster and calculate similarity to centroid
     clustered_sentences = [[] for _ in range(num_clusters)]
     for i, label in enumerate(kmeans.labels_):
         similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
         clustered_sentences[label].append((similarity, sentences[i]))
-
-    # Order sentences within each cluster based on their similarity to the centroid
     for cluster in clustered_sentences:
-        cluster.sort(reverse=True)  # Sort based on similarity (descending order)
-
-    # Return the ordered clustered sentences without similarity scores for display
+        cluster.sort(reverse=True)
     return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
 
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
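The `cluster_sentences` pipeline this hunk trims (TF-IDF vectors, k-means, then per-cluster ordering by similarity to the centroid) can be exercised in isolation; a minimal sketch with invented sample sentences:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel

sentences = [
    "speech recognition converts the audio track into text",
    "the transcript is split into sentences for analysis",
    "clustering groups sentences with similar vocabulary",
    "each cluster is ordered by similarity to its centroid",
]

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences)
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)

# Group by cluster label, scoring each sentence against its centroid
clusters = [[] for _ in range(2)]
for i, label in enumerate(kmeans.labels_):
    sim = linear_kernel(kmeans.cluster_centers_[label:label + 1], X[i:i + 1]).flatten()[0]
    clusters[label].append((sim, sentences[i]))

# Highest-similarity sentences first, as in cluster_sentences
for cluster in clusters:
    cluster.sort(reverse=True)
```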
@@ -164,23 +140,17 @@ def plot_cluster_words(cluster_sentences):
         words = re.findall(r'\b[a-z]{4,}\b', cluster_text)
         word_freq = FreqDist(words)
         top_words = [word for word, _ in word_freq.most_common(20)]
-
         vectorizer = TfidfVectorizer()
         X = vectorizer.fit_transform(top_words)
         word_vectors = X.toarray()
-
         similarity_matrix = cosine_similarity(word_vectors)
-
         G = nx.from_numpy_array(similarity_matrix)
         pos = nx.spring_layout(G, k=0.5)
-
         plt.figure(figsize=(8, 6))
-        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')  # Add word labels to nodes
+        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')
         plt.axis('off')
         plt.title(f"Cluster {i+1} Word Arrangement")
-
         st.pyplot(plt)
-
         st.markdown(f"**Cluster {i+1} Details:**")
         st.markdown(f"Top Words: {', '.join(top_words)}")
         st.markdown(f"Number of Sentences: {len(cluster)}")
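One caveat worth flagging on this hunk (pre-existing behavior, not introduced by the commit): `TfidfVectorizer` treats each top word as its own one-token document, so distinct words get orthogonal vectors, `cosine_similarity` returns an identity matrix, and `nx.from_numpy_array` yields self-loops rather than meaningful edges. A quick check:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

top_words = ["transcript", "cluster", "centroid"]
X = TfidfVectorizer().fit_transform(top_words)
print(cosine_similarity(X.toarray()))  # identity matrix: the words share no tokens
```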
@@ -189,94 +159,76 @@ def plot_cluster_words(cluster_sentences):
 def process_file(file_path):
     with open(file_path, 'r', encoding="utf-8") as file:
         file_text = file.read()
-
-    # Process the selected file
     text_without_timestamps = remove_timestamps(file_text)
     top_words = extract_high_information_words(text_without_timestamps, 10)
-
     with st.expander("📊 Top 10 High Information Words"):
         st.write(top_words)
-
     with st.expander("📈 Relationship Graph"):
         display_relationship_graph(top_words)
-
     context_words = extract_context_words(text_without_timestamps, top_words)
-
     with st.expander("🔗 Context Graph"):
         display_context_graph(context_words)
-
     with st.expander("📑 Context Table"):
         display_context_table(context_words)
-
     sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-
     num_sentences = len(sentences)
     st.write(f"Total Sentences: {num_sentences}")
-
     num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
     clustered_sentences = cluster_sentences(sentences, num_clusters)
-
     col1, col2 = st.columns(2)
-
     with col1:
         st.subheader("Original Text")
         original_text = "\n".join(sentences)
         st.text_area("Original Sentences", value=original_text, height=400)
-
     with col2:
         st.subheader("Clustered Text")
         clusters = ""
         clustered_text = ""
         cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-
         for i, cluster in enumerate(clustered_sentences):
             cluster_text = "\n".join(cluster)
             high_info_words = ", ".join(cluster_high_info_words[i])
             clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
             clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
-
         st.text_area("Clusters", value=clusters, height=200)
         st.text_area("Clustered Sentences", value=clustered_text, height=200)
-
-    # Verify that all sentences are accounted for in the clustered output
     clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
     if set(sentences) == set(clustered_sentences_flat):
         st.write("✅ All sentences are accounted for in the clustered output.")
     else:
         st.write("❌ Some sentences are missing in the clustered output.")
-
     plot_cluster_words(clustered_sentences)
 
-def on_file_select():
-    selected_rows = st.session_state.file_selector['selected_rows']
-    if selected_rows:
-        st.session_state.selected_file = selected_rows[0]['File Name']
-    else:
-        st.session_state.selected_file = None
+def perform_eda(file_name):
+    st.subheader(f"EDA for {file_name}")
+    process_file(os.path.abspath(file_name))
 
-# Main code for UI
 st.title("📺 Transcript Analysis 📊")
 
-# Display dataframe of .txt files
 txt_files_df = get_txt_files()
 st.write("Available .txt files:")
-
-# Use st.data_editor() with on_change callback
-st.data_editor(
-    txt_files_df,
-    hide_index=True,
-    key="file_selector",
-    on_change=on_file_select,
-    disabled=("Full Path",)
-)
-
-# Display the selected file and process button
-if 'selected_file' in st.session_state and st.session_state.selected_file:
-    st.write(f"Selected file: {st.session_state.selected_file}")
-    if st.button(f"Process {st.session_state.selected_file}"):
-        file_path = txt_files_df[txt_files_df['File Name'] == st.session_state.selected_file]['Full Path'].iloc[0]
-        process_file(file_path)
-else:
-    st.write("Please select a file to process.")
+st.dataframe(txt_files_df[['File Name']])
+
+st.write("Select a file to perform EDA:")
+cols = st.columns(len(txt_files_df))
+for i, (_, row) in enumerate(txt_files_df.iterrows()):
+    if cols[i].button(f":file_folder: {row['File Name']}"):
+        perform_eda(row['File Name'])
+
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+if prompt := st.chat_input("Ask a question about the data"):
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    response = f"You asked: {prompt}\n\nThis is a placeholder response. In a real application, you would process the user's question and provide an answer based on the data and EDA results."
+    st.session_state.messages.append({"role": "assistant", "content": response})
+    with st.chat_message("assistant"):
+        st.markdown(response)
 
 st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
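One edge case in the new selector: `st.columns(len(txt_files_df))` will raise if no .txt files are present, since Streamlit rejects a non-positive column count. A guarded variant (a sketch, not in the commit):

```python
if txt_files_df.empty:
    st.info("No .txt files found.")
else:
    cols = st.columns(len(txt_files_df))
    for i, (_, row) in enumerate(txt_files_df.iterrows()):
        if cols[i].button(f":file_folder: {row['File Name']}"):
            perform_eda(row['File Name'])
```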
 