Update app.py
app.py CHANGED
@@ -14,7 +14,6 @@ from io import BytesIO
 import networkx as nx
 import matplotlib.pyplot as plt
 
-# Set page configuration with a title and favicon
 st.set_page_config(
     page_title="📺 Transcript EDA NLTK",
     page_icon="🏠",
@@ -33,7 +32,6 @@ st.markdown('''
 3. 📺 **Transcript Analysis**: Speech recognition 🎙️ and thematic extraction, turning audiovisual content into actionable insights.
 ''')
 
-# Download NLTK resources
 nltk.download('punkt')
 nltk.download('stopwords')
 
@@ -53,7 +51,7 @@ def create_relationship_graph(words):
     for index, word in enumerate(words):
         graph.node(str(index), word)
         if index > 0:
-        graph.edge(str(index - 1), str(index), label=word)
+            graph.edge(str(index - 1), str(index), label=word)
     return graph
 
 def display_relationship_graph(words):
@@ -79,9 +77,9 @@ def create_context_graph(context_words):
         if after_word:
             graph.node(f'after{index}', after_word, shape='diamond')
         if before_word:
-        graph.edge(f'before{index}', f'high{index}', label=before_word)
+            graph.edge(f'before{index}', f'high{index}', label=before_word)
         if after_word:
-        graph.edge(f'high{index}', f'after{index}', label=after_word)
+            graph.edge(f'high{index}', f'after{index}', label=after_word)
     return graph
 
 def display_context_graph(context_words):
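Both graph builders in this file follow the same `graphviz` pattern: one node per word, keyed by index, with labeled edges between adjacent nodes. A minimal standalone sketch of that pattern (the word list is made up; requires the `graphviz` Python package plus the Graphviz binaries):

```python
# Sketch of the node/edge pattern used by create_relationship_graph.
# Hypothetical word list; not part of app.py.
from graphviz import Digraph

words = ["alpha", "beta", "gamma"]
graph = Digraph()
for index, word in enumerate(words):
    graph.node(str(index), word)  # node id is the index, label is the word
    if index > 0:
        graph.edge(str(index - 1), str(index), label=word)
print(graph.source)  # emits DOT text; graph.render() would produce an image
```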
@@ -95,51 +93,29 @@ def display_context_table(context_words):
     st.markdown(table)
 
 def get_txt_files():
-    # Exclude specific files
     excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}
-
-    # List all .txt files excluding the ones in excluded_files
     txt_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]
-
-    # Create a dataframe with file names and full paths
     df = pd.DataFrame({
         'File Name': txt_files,
         'Full Path': [os.path.abspath(f) for f in txt_files]
     })
-
     return df
 
 def cluster_sentences(sentences, num_clusters):
-    # Filter sentences with length over 10 characters
     sentences = [sentence for sentence in sentences if len(sentence) > 10]
-
-    # Check if the number of sentences is less than the desired number of clusters
     if len(sentences) < num_clusters:
-        # If so, adjust the number of clusters to match the number of sentences
        num_clusters = len(sentences)
-
-    # Vectorize the sentences
     vectorizer = TfidfVectorizer()
     X = vectorizer.fit_transform(sentences)
-
-    # Perform k-means clustering
     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
     kmeans.fit(X)
-
-    # Calculate the centroid of each cluster
     cluster_centers = kmeans.cluster_centers_
-
-    # Group sentences by cluster and calculate similarity to centroid
     clustered_sentences = [[] for _ in range(num_clusters)]
     for i, label in enumerate(kmeans.labels_):
         similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
         clustered_sentences[label].append((similarity, sentences[i]))
-
-    # Order sentences within each cluster based on their similarity to the centroid
     for cluster in clustered_sentences:
-    cluster.sort(reverse=True)
-
-    # Return the ordered clustered sentences without similarity scores for display
+        cluster.sort(reverse=True)
     return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
 
 def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
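The `cluster_sentences` change above is whitespace-only, but the function's core idea — TF-IDF vectors, k-means, then ordering each cluster by similarity to its centroid — can be exercised standalone. A minimal sketch with toy sentences (hypothetical data; scikit-learn only):

```python
# Standalone sketch of the TF-IDF + KMeans scheme in cluster_sentences.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import linear_kernel

sentences = [  # toy data, not from the app
    "The transcript covers neural networks in depth.",
    "Neural networks are explained with examples.",
    "The speaker then switches to cooking pasta.",
    "Pasta recipes close out the episode.",
]
num_clusters = 2

X = TfidfVectorizer().fit_transform(sentences)
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(X)

# Group sentences by label; order each group by cosine similarity to its
# centroid (linear_kernel on L2-normalized TF-IDF rows is cosine similarity).
clusters = [[] for _ in range(num_clusters)]
for i, label in enumerate(kmeans.labels_):
    sim = linear_kernel(kmeans.cluster_centers_[label:label + 1], X[i:i + 1]).flatten()[0]
    clusters[label].append((sim, sentences[i]))
for cluster in clusters:
    cluster.sort(reverse=True)
print([[s for _, s in cluster] for cluster in clusters])
```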
@@ -164,23 +140,17 @@ def plot_cluster_words(cluster_sentences):
         words = re.findall(r'\b[a-z]{4,}\b', cluster_text)
         word_freq = FreqDist(words)
         top_words = [word for word, _ in word_freq.most_common(20)]
-
         vectorizer = TfidfVectorizer()
         X = vectorizer.fit_transform(top_words)
         word_vectors = X.toarray()
-
         similarity_matrix = cosine_similarity(word_vectors)
-
         G = nx.from_numpy_array(similarity_matrix)
         pos = nx.spring_layout(G, k=0.5)
-
         plt.figure(figsize=(8, 6))
-    nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')
+        nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')
         plt.axis('off')
         plt.title(f"Cluster {i+1} Word Arrangement")
-
         st.pyplot(plt)
-
         st.markdown(f"**Cluster {i+1} Details:**")
         st.markdown(f"Top Words: {', '.join(top_words)}")
         st.markdown(f"Number of Sentences: {len(cluster)}")
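The drawing code above hinges on `nx.from_numpy_array`, which turns a similarity matrix into a weighted graph for `spring_layout`. A minimal sketch with a made-up matrix (not the app's data):

```python
# Sketch of the similarity-graph drawing in plot_cluster_words.
# Words and matrix values are placeholders, not app output.
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

top_words = ["model", "training", "dataset", "tokens"]
similarity_matrix = np.array([
    [1.0, 0.6, 0.3, 0.1],
    [0.6, 1.0, 0.4, 0.2],
    [0.3, 0.4, 1.0, 0.5],
    [0.1, 0.2, 0.5, 1.0],
])

G = nx.from_numpy_array(similarity_matrix)  # edge weights = matrix entries
pos = nx.spring_layout(G, k=0.5, seed=42)
plt.figure(figsize=(8, 6))
nx.draw_networkx(G, pos, node_size=500, font_size=12, with_labels=True,
                 labels={i: w for i, w in enumerate(top_words)},
                 node_color='skyblue', edge_color='gray')
plt.axis('off')
plt.show()
```

One caveat worth noting: in `plot_cluster_words` itself the matrix comes from TF-IDF-vectorizing single words as separate one-term documents, so off-diagonal similarities are typically zero and the plotted graph can end up nearly edgeless.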
@@ -189,94 +159,76 @@ def plot_cluster_words(cluster_sentences):
 def process_file(file_path):
     with open(file_path, 'r', encoding="utf-8") as file:
         file_text = file.read()
-
-    # Process the selected file
     text_without_timestamps = remove_timestamps(file_text)
     top_words = extract_high_information_words(text_without_timestamps, 10)
-
     with st.expander("Top 10 High Information Words"):
         st.write(top_words)
-
     with st.expander("Relationship Graph"):
         display_relationship_graph(top_words)
-
     context_words = extract_context_words(text_without_timestamps, top_words)
-
     with st.expander("Context Graph"):
         display_context_graph(context_words)
-
     with st.expander("Context Table"):
         display_context_table(context_words)
-
     sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
-
     num_sentences = len(sentences)
     st.write(f"Total Sentences: {num_sentences}")
-
     num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
     clustered_sentences = cluster_sentences(sentences, num_clusters)
-
     col1, col2 = st.columns(2)
-
     with col1:
         st.subheader("Original Text")
         original_text = "\n".join(sentences)
         st.text_area("Original Sentences", value=original_text, height=400)
-
     with col2:
         st.subheader("Clustered Text")
         clusters = ""
         clustered_text = ""
         cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
-
         for i, cluster in enumerate(clustered_sentences):
             cluster_text = "\n".join(cluster)
             high_info_words = ", ".join(cluster_high_info_words[i])
             clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
             clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
-
         st.text_area("Clusters", value=clusters, height=200)
         st.text_area("Clustered Sentences", value=clustered_text, height=200)
-
-    # Verify that all sentences are accounted for in the clustered output
     clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
     if set(sentences) == set(clustered_sentences_flat):
         st.write("✅ All sentences are accounted for in the clustered output.")
     else:
         st.write("❌ Some sentences are missing in the clustered output.")
-
     plot_cluster_words(clustered_sentences)
 
-def
-
-
-    st.session_state.selected_file = selected_rows[0]['File Name']
-else:
-    st.session_state.selected_file = None
+def perform_eda(file_name):
+    st.subheader(f"EDA for {file_name}")
+    process_file(os.path.abspath(file_name))
 
-# Main code for UI
 st.title("📺 Transcript Analysis")
 
-# Display dataframe of .txt files
 txt_files_df = get_txt_files()
 st.write("Available .txt files:")
-
-
-st.
-
-
-
-
-
-
-
-
-
-st.
-
-
-
-
-st.
+st.dataframe(txt_files_df[['File Name']])
+
+st.write("Select a file to perform EDA:")
+cols = st.columns(len(txt_files_df))
+for i, (_, row) in enumerate(txt_files_df.iterrows()):
+    if cols[i].button(f":file_folder: {row['File Name']}"):
+        perform_eda(row['File Name'])
+
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+if prompt := st.chat_input("Ask a question about the data"):
+    st.session_state.messages.append({"role": "user", "content": prompt})
+    with st.chat_message("user"):
+        st.markdown(prompt)
+    response = f"You asked: {prompt}\n\nThis is a placeholder response. In a real application, you would process the user's question and provide an answer based on the data and EDA results."
+    st.session_state.messages.append({"role": "assistant", "content": response})
+    with st.chat_message("assistant"):
+        st.markdown(response)
 
 st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
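The block this commit adds replaces the truncated `st.` fragments and the half-written `def` with a complete file picker and a placeholder chat loop. The chat portion follows the standard Streamlit pattern: persist turns in `st.session_state`, replay them on each rerun, then append the new exchange. A runnable sketch of just that pattern (file name and placeholder reply are illustrative):

```python
# chat_sketch.py — minimal version of the chat loop added in this commit.
# Run with: streamlit run chat_sketch.py
import streamlit as st

if "messages" not in st.session_state:
    st.session_state.messages = []        # chat history survives reruns

for message in st.session_state.messages:  # replay prior turns
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("Ask a question about the data"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)
    response = f"You asked: {prompt}"      # placeholder; no real QA backend
    st.session_state.messages.append({"role": "assistant", "content": response})
    with st.chat_message("assistant"):
        st.markdown(response)
```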
|