awacke1 committed
Commit 1ebbc73 · verified · 1 Parent(s): 34e2c53

Update app.py

Files changed (1):
  1. app.py +147 -151
app.py CHANGED
@@ -1,5 +1,4 @@
 import streamlit as st
-import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
@@ -32,56 +31,6 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 📈: Speech recognition 🎙️ and thematic extraction 🌐, turning audiovisual content into actionable insights 🔑.
 ''')
 
-# 🧠 Cluster sentences using K-means
-def cluster_sentences(sentences, num_clusters):
-    sentences = [s for s in sentences if len(s) > 10]
-    num_clusters = min(num_clusters, len(sentences))
-    vectorizer = TfidfVectorizer()
-    X = vectorizer.fit_transform(sentences)
-    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
-    kmeans.fit(X)
-    clustered_sentences = [[] for _ in range(num_clusters)]
-    for i, label in enumerate(kmeans.labels_):
-        similarity = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
-        clustered_sentences[label].append((similarity, sentences[i]))
-    return [[s for _, s in sorted(cluster, reverse=True)] for cluster in clustered_sentences]
-
-# 📊 Create context graph
-def create_context_graph(context_words):
-    graph = Digraph()
-    for i, (before, high, after) in enumerate(context_words):
-        if before:
-            graph.node(f'before{i}', before, shape='box')
-            graph.edge(f'before{i}', f'high{i}', label=before)
-        graph.node(f'high{i}', high, shape='ellipse')
-        if after:
-            graph.node(f'after{i}', after, shape='diamond')
-            graph.edge(f'high{i}', f'after{i}', label=after)
-    return graph
-
-# 🔗 Create relationship graph
-def create_relationship_graph(words):
-    graph = Digraph()
-    for i, word in enumerate(words):
-        graph.node(str(i), word)
-        if i > 0:
-            graph.edge(str(i-1), str(i), label=word)
-    return graph
-
-# 📈 Display context graph
-def display_context_graph(context_words):
-    st.graphviz_chart(create_context_graph(context_words))
-
-# 📊 Display context table
-def display_context_table(context_words):
-    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
-    table += "\n".join(f"| {b if b else ''} | {h} | {a if a else ''} |" for b, h, a in context_words)
-    st.markdown(table)
-
-# 📈 Display relationship graph
-def display_relationship_graph(words):
-    st.graphviz_chart(create_relationship_graph(words))
-
 # 📥 Download NLTK data
 @st.cache_resource
 def download_nltk_data():
@@ -94,11 +43,11 @@ def download_nltk_data():
     nltk.download('stopwords')
     st.success('NLTK data is ready!')
 
-# 🔍 Extract context words
-def extract_context_words(text, high_information_words):
-    words = nltk.word_tokenize(text)
-    return [(words[i-1] if i > 0 else None, word, words[i+1] if i < len(words)-1 else None)
-            for i, word in enumerate(words) if word.lower() in high_information_words]
 
 # 📊 Extract high information words
 def extract_high_information_words(text, top_n=10):
@@ -111,25 +60,91 @@ def extract_high_information_words(text, top_n=10):
         st.error(f"Error in extract_high_information_words: {str(e)}")
         return []
 
-# 📁 Get text files
-def get_txt_files():
-    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
-    txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
-    return pd.DataFrame({'File Name': txt_files, 'Full Path': [os.path.abspath(f) for f in txt_files]})
 
-# 📊 Get high info words per cluster
-def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
-    return [extract_high_information_words(" ".join(cluster), num_words) for cluster in cluster_sentences]
 
 # 💾 Get text file download link
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
     b64 = base64.b64encode(text_to_download.encode()).decode()
     return f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
 
-# 📊 Perform EDA
-def perform_eda(file_name):
-    st.subheader(f"EDA for {file_name}")
-    process_file(os.path.abspath(file_name))
 
 # 📊 Plot cluster words
 def plot_cluster_words(cluster_sentences):
@@ -154,91 +169,72 @@ def plot_cluster_words(cluster_sentences):
         st.markdown(f"Number of Sentences: {len(cluster)}")
         st.markdown("---")
 
-# 📝 Process file
-def process_file(file_path):
-    try:
-        with open(file_path, 'r', encoding="utf-8") as file:
-            file_text = file.read()
-        text_without_timestamps = remove_timestamps(file_text)
-        top_words = extract_high_information_words(text_without_timestamps, 10)
-
-        with st.expander("📊 Top 10 High Information Words"):
-            st.write(top_words)
-
-        with st.expander("📈 Relationship Graph"):
-            display_relationship_graph(top_words) if top_words else st.warning("Unable to generate relationship graph.")
-
-        context_words = extract_context_words(text_without_timestamps, top_words)
-
-        with st.expander("🔗 Context Graph"):
-            display_context_graph(context_words) if context_words else st.warning("Unable to generate context graph.")
-
-        with st.expander("📑 Context Table"):
-            display_context_table(context_words) if context_words else st.warning("Unable to display context table.")
-
-        sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-        st.write(f"Total Sentences: {len(sentences)}")
-
-        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
-        clustered_sentences = cluster_sentences(sentences, num_clusters)
-
-        col1, col2 = st.columns(2)
-        with col1:
-            st.subheader("Original Text")
-            st.text_area("Original Sentences", value="\n".join(sentences), height=400)
-
-        with col2:
-            st.subheader("Clustered Text")
-            cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-            clusters = "\n".join(f"Cluster {i+1} (High Info Words: {', '.join(words)})"
-                                 for i, words in enumerate(cluster_high_info_words))
-            clustered_text = "\n\n".join(f"Cluster {i+1} (High Info Words: {', '.join(words)}):\n{cluster_text}"
-                                         for i, (words, cluster_text) in enumerate(zip(cluster_high_info_words,
-                                             ["\n".join(cluster) for cluster in clustered_sentences])))
-            st.text_area("Clusters", value=clusters, height=200)
-            st.text_area("Clustered Sentences", value=clustered_text, height=200)
-
-        clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
-        st.write("✅ All sentences are accounted for in the clustered output." if set(sentences) == set(clustered_sentences_flat)
-                 else "❌ Some sentences are missing in the clustered output.")
-
-        plot_cluster_words(clustered_sentences)
-    except Exception as e:
-        st.error(f"Error processing file: {str(e)}")
 
-# 🕰️ Remove timestamps
-def remove_timestamps(text):
-    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
 
-# Main execution
-download_nltk_data()
 
-st.title("📺 Transcript Analysis 📊")
-
-txt_files_df = get_txt_files()
-st.write("Available .txt files:")
-st.dataframe(txt_files_df[['File Name']])
-
-st.write("Select a file to perform EDA:")
-cols = st.columns(len(txt_files_df))
-for i, (_, row) in enumerate(txt_files_df.iterrows()):
-    if cols[i].button(f":file_folder: {row['File Name']}"):
-        perform_eda(row['File Name'])
-
-if "messages" not in st.session_state:
-    st.session_state.messages = []
-
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.markdown(message["content"])
-
-if prompt := st.chat_input("Ask a question about the data"):
-    st.session_state.messages.append({"role": "user", "content": prompt})
-    with st.chat_message("user"):
-        st.markdown(prompt)
-    response = f"You asked: {prompt}\n\nThis is a placeholder response. In a real application, you would process the user's question and provide an answer based on the data and EDA results."
-    st.session_state.messages.append({"role": "assistant", "content": response})
-    with st.chat_message("assistant"):
-        st.markdown(response)
 
 st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
 
 import streamlit as st
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cluster import KMeans
 from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
 
 3. 📺 **Transcript Analysis** 📈: Speech recognition 🎙️ and thematic extraction 🌐, turning audiovisual content into actionable insights 🔑.
 ''')
 
 # 📥 Download NLTK data
 @st.cache_resource
 def download_nltk_data():
 
     nltk.download('stopwords')
     st.success('NLTK data is ready!')
 
+download_nltk_data()
+
+# 🕰️ Remove timestamps
+def remove_timestamps(text):
+    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
 
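The pattern removes an `MM:SS` stamp line together with the line that follows it, matching the two-line layout of pasted video transcripts. A minimal sketch with a made-up snippet (not from the repo):

```python
import re

# Hypothetical transcript excerpt: stamp line, caption line, then a plain line.
sample = "0:01\nwelcome to the stream\nThis line keeps no timestamp.\n"

# Same pattern as remove_timestamps: strips the stamp and the line after it.
cleaned = re.sub(r'\d{1,2}:\d{2}\n.*\n', '', sample)
print(cleaned)  # -> "This line keeps no timestamp.\n"
```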
 # 📊 Extract high information words
 def extract_high_information_words(text, top_n=10):
 
         st.error(f"Error in extract_high_information_words: {str(e)}")
         return []
 
+# 🔗 Create relationship graph
+def create_relationship_graph(words):
+    graph = Digraph()
+    for i, word in enumerate(words):
+        graph.node(str(i), word)
+        if i > 0:
+            graph.edge(str(i-1), str(i), label=word)
+    return graph
+
+# 📈 Display relationship graph
+def display_relationship_graph(words):
+    try:
+        graph = create_relationship_graph(words)
+        st.graphviz_chart(graph)
+    except Exception as e:
+        st.error(f"Error displaying relationship graph: {str(e)}")
+
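For intuition: the relationship graph is a simple chain over the top words, with positional node ids and each edge labeled by its target word. A self-contained sketch under assumed inputs (hypothetical word list; needs the `graphviz` Python package and Graphviz binaries):

```python
from graphviz import Digraph

words = ["speech", "recognition", "transcript"]  # hypothetical top words
graph = Digraph()
for i, word in enumerate(words):
    graph.node(str(i), word)                        # node id "0", "1", ... labeled with the word
    if i > 0:
        graph.edge(str(i - 1), str(i), label=word)  # chain edge labeled by the target word
print(graph.source)  # DOT source; the app renders it with st.graphviz_chart(graph)
```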
+# 🔍 Extract context words
+def extract_context_words(text, high_information_words):
+    words = nltk.word_tokenize(text)
+    return [(words[i-1] if i > 0 else None, word, words[i+1] if i < len(words)-1 else None)
+            for i, word in enumerate(words) if word.lower() in high_information_words]
+
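`extract_context_words` returns one `(before, word, after)` triple per occurrence of a high-information word, with `None` marking a missing neighbor at either end. A runnable sketch with toy inputs (the text and word set are made up):

```python
import nltk
nltk.download('punkt', quiet=True)  # tokenizer model; newer NLTK may also want 'punkt_tab'

text = "speech recognition turns audio into text"  # toy input
high_info = {"recognition", "audio"}               # assumed top words

words = nltk.word_tokenize(text)
triples = [(words[i-1] if i > 0 else None, w, words[i+1] if i < len(words)-1 else None)
           for i, w in enumerate(words) if w.lower() in high_info]
print(triples)
# [('speech', 'recognition', 'turns'), ('turns', 'audio', 'into')]
```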
+# 📊 Create context graph
+def create_context_graph(context_words):
+    graph = Digraph()
+    for i, (before, high, after) in enumerate(context_words):
+        if before:
+            graph.node(f'before{i}', before, shape='box')
+            graph.edge(f'before{i}', f'high{i}', label=before)
+        graph.node(f'high{i}', high, shape='ellipse')
+        if after:
+            graph.node(f'after{i}', after, shape='diamond')
+            graph.edge(f'high{i}', f'after{i}', label=after)
+    return graph
+
+# 📈 Display context graph
+def display_context_graph(context_words):
+    try:
+        graph = create_context_graph(context_words)
+        st.graphviz_chart(graph)
+    except Exception as e:
+        st.error(f"Error displaying context graph: {str(e)}")
+
+# 📊 Display context table
+def display_context_table(context_words):
+    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
+    table += "\n".join(f"| {b if b else ''} | {h} | {a if a else ''} |" for b, h, a in context_words)
+    st.markdown(table)
+
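`display_context_table` builds plain markdown, one row per triple, with empty cells where a neighbor is `None`. With toy triples (assumed, not from the repo) the generated text looks like this:

```python
context_words = [("turns", "audio", "into"), (None, "speech", "recognition")]  # toy triples

table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
table += "\n".join(f"| {b if b else ''} | {h} | {a if a else ''} |" for b, h, a in context_words)
print(table)
# | Before | High Info Word | After |
# |--------|----------------|-------|
# | turns | audio | into |
# |  | speech | recognition |
```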
+ # πŸ“ Load example files
114
+ def load_example_files():
115
+ excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
116
+ example_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
117
+ if example_files:
118
+ selected_file = st.selectbox("πŸ“„ Select an example file:", example_files)
119
+ if st.button(f"πŸ“‚ Load {selected_file}"):
120
+ with open(selected_file, 'r', encoding="utf-8") as file:
121
+ return file.read()
122
+ else:
123
+ st.write("No suitable example files found.")
124
+ return None
125
+
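Note that `load_example_files` only returns text on the rerun in which its 📂 Load button is clicked; on any other run it falls through to `return None`, so the main flow below falls back to the uploaded file or an empty string.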
+# 🧠 Cluster sentences
+def cluster_sentences(sentences, num_clusters):
+    sentences = [s for s in sentences if len(s) > 10]
+    num_clusters = min(num_clusters, len(sentences))
+    vectorizer = TfidfVectorizer()
+    X = vectorizer.fit_transform(sentences)
+    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+    kmeans.fit(X)
+    clustered_sentences = [[] for _ in range(num_clusters)]
+    for i, label in enumerate(kmeans.labels_):
+        similarity = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
+        clustered_sentences[label].append((similarity, sentences[i]))
+    return [[s for _, s in sorted(cluster, reverse=True)] for cluster in clustered_sentences]
 
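The clustering step vectorizes sentences with TF-IDF, runs K-means, then ranks each cluster's members by their dot-product similarity to the cluster centroid via `linear_kernel`. A self-contained sketch with made-up sentences standing in for transcript lines:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel

sentences = [  # toy stand-ins for transcript lines
    "the model transcribes the audio stream",
    "audio is transcribed into text by the model",
    "k-means groups the sentences into themes",
    "each theme collects similar sentences",
]
X = TfidfVectorizer().fit_transform(sentences)
kmeans = KMeans(n_clusters=2, random_state=42).fit(X)
for i, label in enumerate(kmeans.labels_):
    # Similarity of sentence i to its own cluster centroid, as in the app.
    score = linear_kernel(kmeans.cluster_centers_[label:label+1], X[i:i+1]).flatten()[0]
    print(label, round(score, 3), sentences[i])
```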
 # 💾 Get text file download link
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
     b64 = base64.b64encode(text_to_download.encode()).decode()
     return f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
 
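`get_text_file_download_link` embeds the whole payload in a `data:` URI, so the browser can save the file without a server round-trip. A sketch of the encoding step (toy payload; the diff does not show the call site, but rendering such HTML in Streamlit typically needs `unsafe_allow_html=True`):

```python
import base64

text_to_download = "hello transcript"  # toy payload
b64 = base64.b64encode(text_to_download.encode()).decode()
link = f'<a href="data:file/txt;base64,{b64}" download="Output.txt">💾 Save</a>'
print(link)  # in the app: st.markdown(link, unsafe_allow_html=True)
```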
+# 📊 Get high info words per cluster
+def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
+    return [extract_high_information_words(" ".join(cluster), num_words) for cluster in cluster_sentences]
 
 # 📊 Plot cluster words
 def plot_cluster_words(cluster_sentences):
 
         st.markdown(f"Number of Sentences: {len(cluster)}")
         st.markdown("---")
 
+# Main code for UI
+uploaded_file = st.file_uploader("📁 Choose a .txt file", type=['txt'])
 
+example_text = load_example_files()
 
+if example_text:
+    file_text = example_text
+elif uploaded_file:
+    file_text = uploaded_file.read().decode("utf-8")
+else:
+    file_text = ""
+
+if file_text:
+    text_without_timestamps = remove_timestamps(file_text)
+    top_words = extract_high_information_words(text_without_timestamps, 10)
+
+    with st.expander("📊 Top 10 High Information Words"):
+        st.write(top_words)
+
+    with st.expander("📈 Relationship Graph"):
+        display_relationship_graph(top_words)
+
+    context_words = extract_context_words(text_without_timestamps, top_words)
+
+    with st.expander("🔗 Context Graph"):
+        display_context_graph(context_words)
+
+    with st.expander("📑 Context Table"):
+        display_context_table(context_words)
+
+    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
+
+    num_sentences = len(sentences)
+    st.write(f"Total Sentences: {num_sentences}")
+
+    num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
+    clustered_sentences = cluster_sentences(sentences, num_clusters)
+
+    col1, col2 = st.columns(2)
+
+    with col1:
+        st.subheader("Original Text")
+        original_text = "\n".join(sentences)
+        st.text_area("Original Sentences", value=original_text, height=400)
+
+    with col2:
+        st.subheader("Clustered Text")
+        clusters = ""
+        clustered_text = ""
+        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
+
+        for i, cluster in enumerate(clustered_sentences):
+            cluster_text = "\n".join(cluster)
+            high_info_words = ", ".join(cluster_high_info_words[i])
+            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
+            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+
+        st.text_area("Clusters", value=clusters, height=200)
+        st.text_area("Clustered Sentences", value=clustered_text, height=200)
 
+    clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
+    if set(sentences) == set(clustered_sentences_flat):
+        st.write("✅ All sentences are accounted for in the clustered output.")
+    else:
+        st.write("❌ Some sentences are missing in the clustered output.")
+
+    plot_cluster_words(clustered_sentences)
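One caveat worth knowing: the completeness check compares sets, so duplicate sentences collapse and a lost duplicate goes unreported. A tiny illustration with assumed lists:

```python
original = ["alpha sentence one", "alpha sentence one", "beta sentence two"]
clustered = ["beta sentence two", "alpha sentence one"]  # one duplicate lost

# The set comparison still passes, even though a duplicate line disappeared.
print(set(original) == set(clustered))  # True
```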
 
 st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")