awacke1 committed
Commit 34e2c53 · verified · 1 Parent(s): 9e06c9d

Update app.py

Files changed (1)
  1. app.py +111 -116
app.py CHANGED
@@ -32,134 +32,120 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 📈: Speech recognition 🎙️ and thematic extraction 🌐, turning audiovisual content into actionable insights 🔑.
 ''')
 
+# 🧠 Cluster sentences using K-means
+def cluster_sentences(sentences, num_clusters):
+    sentences = [s for s in sentences if len(s) > 10]
+    num_clusters = min(num_clusters, len(sentences))
+    vectorizer = TfidfVectorizer()
+    X = vectorizer.fit_transform(sentences)
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+    kmeans.fit(X)
+    clustered_sentences = [[] for _ in range(num_clusters)]
+    for i, label in enumerate(kmeans.labels_):
+        similarity = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
+        clustered_sentences[label].append((similarity, sentences[i]))
+    return [[s for _, s in sorted(cluster, reverse=True)] for cluster in clustered_sentences]
+
+# 📊 Create context graph
+def create_context_graph(context_words):
+    graph = Digraph()
+    for i, (before, high, after) in enumerate(context_words):
+        if before:
+            graph.node(f'before{i}', before, shape='box')
+            graph.edge(f'before{i}', f'high{i}', label=before)
+        graph.node(f'high{i}', high, shape='ellipse')
+        if after:
+            graph.node(f'after{i}', after, shape='diamond')
+            graph.edge(f'high{i}', f'after{i}', label=after)
+    return graph
+
+# 🔗 Create relationship graph
+def create_relationship_graph(words):
+    graph = Digraph()
+    for i, word in enumerate(words):
+        graph.node(str(i), word)
+        if i > 0:
+            graph.edge(str(i-1), str(i), label=word)
+    return graph
+
+# 📈 Display context graph
+def display_context_graph(context_words):
+    st.graphviz_chart(create_context_graph(context_words))
+
+# 📊 Display context table
+def display_context_table(context_words):
+    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
+    table += "\n".join(f"| {b if b else ''} | {h} | {a if a else ''} |" for b, h, a in context_words)
+    st.markdown(table)
+
+# 📈 Display relationship graph
+def display_relationship_graph(words):
+    st.graphviz_chart(create_relationship_graph(words))
+
+# 📥 Download NLTK data
 @st.cache_resource
 def download_nltk_data():
     try:
         nltk.data.find('tokenizers/punkt')
         nltk.data.find('corpora/stopwords')
     except LookupError:
-        nltk.download('punkt')
-        nltk.download('stopwords')
-
-download_nltk_data()
+        with st.spinner('Downloading required NLTK data...'):
+            nltk.download('punkt')
+            nltk.download('stopwords')
+            st.success('NLTK data is ready!')
 
-def remove_timestamps(text):
-    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
+# 🔍 Extract context words
+def extract_context_words(text, high_information_words):
+    words = nltk.word_tokenize(text)
+    return [(words[i-1] if i > 0 else None, word, words[i+1] if i < len(words)-1 else None)
+            for i, word in enumerate(words) if word.lower() in high_information_words]
 
+# 📊 Extract high information words
 def extract_high_information_words(text, top_n=10):
     try:
-        words = nltk.word_tokenize(text)
-        words = [word.lower() for word in words if word.isalpha()]
+        words = [word.lower() for word in nltk.word_tokenize(text) if word.isalpha()]
         stop_words = set(stopwords.words('english'))
         filtered_words = [word for word in words if word not in stop_words]
-        freq_dist = FreqDist(filtered_words)
-        return [word for word, _ in freq_dist.most_common(top_n)]
+        return [word for word, _ in FreqDist(filtered_words).most_common(top_n)]
     except Exception as e:
         st.error(f"Error in extract_high_information_words: {str(e)}")
         return []
 
-def create_relationship_graph(words):
-    graph = Digraph()
-    for index, word in enumerate(words):
-        graph.node(str(index), word)
-        if index > 0:
-            graph.edge(str(index - 1), str(index), label=word)
-    return graph
-
-def display_relationship_graph(words):
-    graph = create_relationship_graph(words)
-    st.graphviz_chart(graph)
-
-def extract_context_words(text, high_information_words):
-    words = nltk.word_tokenize(text)
-    context_words = []
-    for index, word in enumerate(words):
-        if word.lower() in high_information_words:
-            before_word = words[index - 1] if index > 0 else None
-            after_word = words[index + 1] if index < len(words) - 1 else None
-            context_words.append((before_word, word, after_word))
-    return context_words
-
-def create_context_graph(context_words):
-    graph = Digraph()
-    for index, (before_word, high_info_word, after_word) in enumerate(context_words):
-        if before_word:
-            graph.node(f'before{index}', before_word, shape='box')
-        graph.node(f'high{index}', high_info_word, shape='ellipse')
-        if after_word:
-            graph.node(f'after{index}', after_word, shape='diamond')
-        if before_word:
-            graph.edge(f'before{index}', f'high{index}', label=before_word)
-        if after_word:
-            graph.edge(f'high{index}', f'after{index}', label=after_word)
-    return graph
-
-def display_context_graph(context_words):
-    graph = create_context_graph(context_words)
-    st.graphviz_chart(graph)
-
-def display_context_table(context_words):
-    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
-    for before, high, after in context_words:
-        table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
-    st.markdown(table)
-
+# 📁 Get text files
 def get_txt_files():
     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
     txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
-    df = pd.DataFrame({
-        'File Name': txt_files,
-        'Full Path': [os.path.abspath(f) for f in txt_files]
-    })
-    return df
+    return pd.DataFrame({'File Name': txt_files, 'Full Path': [os.path.abspath(f) for f in txt_files]})
 
-def cluster_sentences(sentences, num_clusters):
-    sentences = [sentence for sentence in sentences if len(sentence) > 10]
-    if len(sentences) < num_clusters:
-        num_clusters = len(sentences)
-    vectorizer = TfidfVectorizer()
-    X = vectorizer.fit_transform(sentences)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
-    kmeans.fit(X)
-    cluster_centers = kmeans.cluster_centers_
-    clustered_sentences = [[] for _ in range(num_clusters)]
-    for i, label in enumerate(kmeans.labels_):
-        similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
-        clustered_sentences[label].append((similarity, sentences[i]))
-    for cluster in clustered_sentences:
-        cluster.sort(reverse=True)
-    return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
+# 📊 Get high info words per cluster
+def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
+    return [extract_high_information_words(" ".join(cluster), num_words) for cluster in cluster_sentences]
 
+# 💾 Get text file download link
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
-    buffer = BytesIO()
-    buffer.write(text_to_download.encode())
-    buffer.seek(0)
-    b64 = base64.b64encode(buffer.read()).decode()
-    href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
-    return href
+    b64 = base64.b64encode(text_to_download.encode()).decode()
+    return f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
 
-def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
-    cluster_high_info_words = []
-    for cluster in cluster_sentences:
-        cluster_text = " ".join(cluster)
-        high_info_words = extract_high_information_words(cluster_text, num_words)
-        cluster_high_info_words.append(high_info_words)
-    return cluster_high_info_words
+# 📊 Perform EDA
+def perform_eda(file_name):
+    st.subheader(f"EDA for {file_name}")
+    process_file(os.path.abspath(file_name))
 
+# 📊 Plot cluster words
 def plot_cluster_words(cluster_sentences):
     for i, cluster in enumerate(cluster_sentences):
-        cluster_text = " ".join(cluster)
-        words = re.findall(r'\b[a-z]{4,}\b', cluster_text)
+        words = re.findall(r'\b[a-z]{4,}\b', " ".join(cluster))
         word_freq = FreqDist(words)
         top_words = [word for word, _ in word_freq.most_common(20)]
         vectorizer = TfidfVectorizer()
         X = vectorizer.fit_transform(top_words)
-        word_vectors = X.toarray()
-        similarity_matrix = cosine_similarity(word_vectors)
+        similarity_matrix = cosine_similarity(X.toarray())
         G = nx.from_numpy_array(similarity_matrix)
         pos = nx.spring_layout(G, k=0.5)
         plt.figure(figsize=(8, 6))
-        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')
+        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True,
+                         labels={i: word for i, word in enumerate(top_words)},
+                         node_color='skyblue', edge_color='gray')
         plt.axis('off')
         plt.title(f"Cluster {i+1} Word Arrangement")
         st.pyplot(plt)
@@ -168,55 +154,64 @@ def plot_cluster_words(cluster_sentences):
         st.markdown(f"Number of Sentences: {len(cluster)}")
         st.markdown("---")
 
+# 📝 Process file
 def process_file(file_path):
     try:
         with open(file_path, 'r', encoding="utf-8") as file:
             file_text = file.read()
         text_without_timestamps = remove_timestamps(file_text)
         top_words = extract_high_information_words(text_without_timestamps, 10)
+
         with st.expander("📊 Top 10 High Information Words"):
             st.write(top_words)
+
         with st.expander("📈 Relationship Graph"):
-            display_relationship_graph(top_words)
+            display_relationship_graph(top_words) if top_words else st.warning("Unable to generate relationship graph.")
+
         context_words = extract_context_words(text_without_timestamps, top_words)
+
         with st.expander("🔗 Context Graph"):
-            display_context_graph(context_words)
+            display_context_graph(context_words) if context_words else st.warning("Unable to generate context graph.")
+
         with st.expander("📑 Context Table"):
-            display_context_table(context_words)
+            display_context_table(context_words) if context_words else st.warning("Unable to display context table.")
+
         sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-        num_sentences = len(sentences)
-        st.write(f"Total Sentences: {num_sentences}")
+        st.write(f"Total Sentences: {len(sentences)}")
+
         num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
         clustered_sentences = cluster_sentences(sentences, num_clusters)
+
         col1, col2 = st.columns(2)
         with col1:
             st.subheader("Original Text")
-            original_text = "\n".join(sentences)
-            st.text_area("Original Sentences", value=original_text, height=400)
+            st.text_area("Original Sentences", value="\n".join(sentences), height=400)
+
         with col2:
             st.subheader("Clustered Text")
-            clusters = ""
-            clustered_text = ""
             cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-            for i, cluster in enumerate(clustered_sentences):
-                cluster_text = "\n".join(cluster)
-                high_info_words = ", ".join(cluster_high_info_words[i])
-                clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
-                clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+            clusters = "\n".join(f"Cluster {i+1} (High Info Words: {', '.join(words)})"
+                                 for i, words in enumerate(cluster_high_info_words))
+            clustered_text = "\n\n".join(f"Cluster {i+1} (High Info Words: {', '.join(words)}):\n{cluster_text}"
+                                         for i, (words, cluster_text) in enumerate(zip(cluster_high_info_words,
+                                                                                       ["\n".join(cluster) for cluster in clustered_sentences])))
             st.text_area("Clusters", value=clusters, height=200)
             st.text_area("Clustered Sentences", value=clustered_text, height=200)
+
         clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
-        if set(sentences) == set(clustered_sentences_flat):
-            st.write("✅ All sentences are accounted for in the clustered output.")
-        else:
-            st.write("❌ Some sentences are missing in the clustered output.")
+        st.write("✅ All sentences are accounted for in the clustered output." if set(sentences) == set(clustered_sentences_flat)
+                 else "❌ Some sentences are missing in the clustered output.")
+
         plot_cluster_words(clustered_sentences)
     except Exception as e:
         st.error(f"Error processing file: {str(e)}")
 
-def perform_eda(file_name):
-    st.subheader(f"EDA for {file_name}")
-    process_file(os.path.abspath(file_name))
+# 🕰️ Remove timestamps
+def remove_timestamps(text):
+    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
+
+# Main execution
+download_nltk_data()
 
 st.title("📺 Transcript Analysis 📊")
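For reference, the reorganized cluster_sentences above groups transcript lines by TF-IDF similarity and ranks each sentence against its K-means cluster centroid. A minimal standalone sketch of that technique, with invented sample sentences (assumes scikit-learn is installed):

# Sketch of the TF-IDF + K-means ranking used by cluster_sentences above.
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

sentences = [
    "The model was trained on transcript data.",        # invented examples
    "Training used transcript data from two sources.",
    "Speakers often pause between topics.",
    "Topic changes tend to follow long pauses.",
]
X = TfidfVectorizer().fit_transform(sentences)           # one TF-IDF row per sentence
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(X)

# Rank sentences by similarity to their cluster centroid, most central first.
clusters = [[] for _ in range(2)]
for i, label in enumerate(kmeans.labels_):
    sim = linear_kernel(kmeans.cluster_centers_[label:label + 1], X[i:i + 1]).flatten()[0]
    clusters[label].append((sim, sentences[i]))
for label, cluster in enumerate(clusters):
    print(f"Cluster {label}:", [s for _, s in sorted(cluster, reverse=True)])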
 
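Likewise, a standalone sketch of the stopword-filtered frequency counting behind extract_high_information_words (assumes the NLTK punkt and stopwords data are already present, as download_nltk_data ensures; the sample text is invented):

# Tokenize, keep alphabetic tokens, drop stopwords, rank by frequency.
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

text = "The quick brown fox jumps over the lazy dog. The dog sleeps."
words = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]
filtered = [w for w in words if w not in set(stopwords.words('english'))]
print([w for w, _ in FreqDist(filtered).most_common(5)])
# e.g. ['dog', 'quick', 'brown', 'fox', 'jumps']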