awacke1 committed · verified
Commit 3f1ac1e · 1 Parent(s): 069bed5

Update app.py

Files changed (1)
  1. app.py +192 -13
app.py CHANGED
@@ -1,6 +1,5 @@
  import streamlit as st
  import pandas as pd
- import os
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.cluster import KMeans
  from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
@@ -8,13 +7,92 @@ import nltk
  from nltk.corpus import stopwords
  from nltk import FreqDist
  import re
  import base64
  from graphviz import Digraph
  from io import BytesIO
  import networkx as nx
  import matplotlib.pyplot as plt

- # ... [Keep all the existing imports and configurations] ...

  def get_txt_files():
      # Exclude specific files
@@ -31,21 +109,84 @@ def get_txt_files():

      return df

- # ... [Keep all the existing functions] ...

- # Main code for UI
- st.title("📺 Transcript Analysis 📊")

- # Display dataframe of .txt files
- txt_files_df = get_txt_files()
- st.write("Available .txt files:")
- st.dataframe(txt_files_df)

- # Allow user to select a file from the dataframe
- selected_file = st.selectbox("Select a file to process:", txt_files_df['File Name'])

- if st.button(f"Process {selected_file}"):
-     file_path = txt_files_df[txt_files_df['File Name'] == selected_file]['Full Path'].iloc[0]
      with open(file_path, 'r', encoding="utf-8") as file:
          file_text = file.read()
@@ -106,4 +247,42 @@ if st.button(f"Process {selected_file}"):

      plot_cluster_words(clustered_sentences)

  st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")
 
  import streamlit as st
  import pandas as pd
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.cluster import KMeans
  from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
  import nltk
  from nltk.corpus import stopwords
  from nltk import FreqDist
  import re
+ import os
  import base64
  from graphviz import Digraph
  from io import BytesIO
  import networkx as nx
  import matplotlib.pyplot as plt

+ # Set page configuration with a title and favicon
+ st.set_page_config(
+     page_title="📺Transcript📜EDA🔍NLTK",
+     page_icon="🌠",
+     layout="wide",
+     initial_sidebar_state="expanded",
+     menu_items={
+         'Get Help': 'https://huggingface.co/awacke1',
+         'Report a bug': "https://huggingface.co/awacke1",
+         'About': "https://huggingface.co/awacke1"
+     }
+ )
+
+ st.markdown('''
+ 1. 🔍 **Transcript Insights Using Exploratory Data Analysis (EDA)** 📊 - Unveil hidden patterns 🕵️‍♂️ and insights 🧠 in your transcripts 🏆.
+ 2. 📜 **Natural Language Toolkit (NLTK)** 🛠️ - Your compass 🧭 in the vast landscape of NLP.
+ 3. 📺 **Transcript Analysis** 📈 - Speech recognition 🎙️ and thematic extraction 🌐 turn audiovisual content into actionable insights 🔑.
+ ''')
+
+ # Download NLTK resources
+ nltk.download('punkt')
+ nltk.download('stopwords')
+
+ def remove_timestamps(text):
+     return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
+
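A quick check of what this pattern actually strips (the sample transcript is hypothetical): it removes a bare "M:SS" stamp line together with the line that follows it, leaving everything else intact.

sample = "0:01\nwelcome to the show\nhello everyone\n"  # hypothetical snippet
remove_timestamps(sample)  # returns "hello everyone\n"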
+ def extract_high_information_words(text, top_n=10):
+     words = nltk.word_tokenize(text)
+     words = [word.lower() for word in words if word.isalpha()]
+     stop_words = set(stopwords.words('english'))
+     filtered_words = [word for word in words if word not in stop_words]
+     freq_dist = FreqDist(filtered_words)
+     return [word for word, _ in freq_dist.most_common(top_n)]
+
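Roughly, this keeps the most frequent alphabetic, non-stopword tokens. A hedged example (input text and expected output are illustrative, and the result depends on NLTK's English stopword list):

text = "The cat sat. The cat napped. A dog barked at the cat."
extract_high_information_words(text, top_n=3)  # likely ['cat', 'sat', 'napped']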
+ def create_relationship_graph(words):
+     graph = Digraph()
+     for index, word in enumerate(words):
+         graph.node(str(index), word)
+         if index > 0:
+             graph.edge(str(index - 1), str(index), label=word)  # Add word as edge label
+     return graph
+
+ def display_relationship_graph(words):
+     graph = create_relationship_graph(words)
+     st.graphviz_chart(graph)
+
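A hedged sketch of driving the two helpers above (the input variable is illustrative, not from the commit): the chain links consecutive words in list order, which here is descending frequency.

top_words = extract_high_information_words(file_text, top_n=10)  # illustrative input
display_relationship_graph(top_words)  # renders word0 -> word1 -> ... as a graphviz chain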
+ def extract_context_words(text, high_information_words):
+     words = nltk.word_tokenize(text)
+     context_words = []
+     for index, word in enumerate(words):
+         if word.lower() in high_information_words:
+             before_word = words[index - 1] if index > 0 else None
+             after_word = words[index + 1] if index < len(words) - 1 else None
+             context_words.append((before_word, word, after_word))
+     return context_words
+
+ def create_context_graph(context_words):
+     graph = Digraph()
+     for index, (before_word, high_info_word, after_word) in enumerate(context_words):
+         if before_word:
+             graph.node(f'before{index}', before_word, shape='box')
+         graph.node(f'high{index}', high_info_word, shape='ellipse')
+         if after_word:
+             graph.node(f'after{index}', after_word, shape='diamond')
+         if before_word:
+             graph.edge(f'before{index}', f'high{index}', label=before_word)  # Add before_word as edge label
+         if after_word:
+             graph.edge(f'high{index}', f'after{index}', label=after_word)  # Add after_word as edge label
+     return graph
+
+ def display_context_graph(context_words):
+     graph = create_context_graph(context_words)
+     st.graphviz_chart(graph)
+
+ def display_context_table(context_words):
+     table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
+     for before, high, after in context_words:
+         table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
+     st.markdown(table)
+
  def get_txt_files():
      # Exclude specific files
...
      return df

+ def cluster_sentences(sentences, num_clusters):
+     # Filter sentences with length over 10 characters
+     sentences = [sentence for sentence in sentences if len(sentence) > 10]
+
+     # Check if the number of sentences is less than the desired number of clusters
+     if len(sentences) < num_clusters:
+         # If so, adjust the number of clusters to match the number of sentences
+         num_clusters = len(sentences)
+
+     # Vectorize the sentences
+     vectorizer = TfidfVectorizer()
+     X = vectorizer.fit_transform(sentences)
+
+     # Perform k-means clustering
+     kmeans = KMeans(n_clusters=num_clusters, random_state=42)
+     kmeans.fit(X)
+
+     # Calculate the centroid of each cluster
+     cluster_centers = kmeans.cluster_centers_
+
+     # Group sentences by cluster and calculate similarity to centroid
+     clustered_sentences = [[] for _ in range(num_clusters)]
+     for i, label in enumerate(kmeans.labels_):
+         similarity = linear_kernel(cluster_centers[label:label+1], X[i:i+1]).flatten()[0]
+         clustered_sentences[label].append((similarity, sentences[i]))
+
+     # Order sentences within each cluster based on their similarity to the centroid
+     for cluster in clustered_sentences:
+         cluster.sort(reverse=True)  # Sort based on similarity (descending order)
+
+     # Return the ordered clustered sentences without similarity scores for display
+     return [[sentence for _, sentence in cluster] for cluster in clustered_sentences]
+
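A small usage sketch (the sentence source is assumed, since the caller is hidden in the collapsed part of the diff; only sentences longer than 10 characters survive the filter):

sentences = nltk.sent_tokenize(file_text)  # assumed sentence segmentation
for i, cluster in enumerate(cluster_sentences(sentences, num_clusters=5)):
    print(f"Cluster {i+1}: {len(cluster)} sentences")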
+ def get_text_file_download_link(text_to_download, filename='Output.txt', button_label="💾 Save"):
+     buffer = BytesIO()
+     buffer.write(text_to_download.encode())
+     buffer.seek(0)
+     b64 = base64.b64encode(buffer.read()).decode()
+     href = f'<a href="data:file/txt;base64,{b64}" download="{filename}" style="margin-top:20px;">{button_label}</a>'
+     return href
+
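The returned href is a raw HTML anchor wrapping a base64 data URI, so it has to be rendered with unsafe_allow_html. A hedged usage sketch (arguments are illustrative):

link = get_text_file_download_link("hello world", filename='Notes.txt')
st.markdown(link, unsafe_allow_html=True)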
+ def get_high_info_words_per_cluster(cluster_sentences, num_words=5):
+     cluster_high_info_words = []
+     for cluster in cluster_sentences:
+         cluster_text = " ".join(cluster)
+         high_info_words = extract_high_information_words(cluster_text, num_words)
+         cluster_high_info_words.append(high_info_words)
+     return cluster_high_info_words
+
+ def plot_cluster_words(cluster_sentences):
+     for i, cluster in enumerate(cluster_sentences):
+         cluster_text = " ".join(cluster)
+         words = re.findall(r'\b[a-z]{4,}\b', cluster_text)
+         word_freq = FreqDist(words)
+         top_words = [word for word, _ in word_freq.most_common(20)]
+
+         vectorizer = TfidfVectorizer()
+         X = vectorizer.fit_transform(top_words)
+         word_vectors = X.toarray()
+
+         similarity_matrix = cosine_similarity(word_vectors)
+
+         G = nx.from_numpy_array(similarity_matrix)
+         pos = nx.spring_layout(G, k=0.5)
+
+         plt.figure(figsize=(8, 6))
+         nx.draw_networkx(G, pos, node_size=500, font_size=12, font_weight='bold', with_labels=True, labels={i: word for i, word in enumerate(top_words)}, node_color='skyblue', edge_color='gray')  # Add word labels to nodes
+         plt.axis('off')
+         plt.title(f"Cluster {i+1} Word Arrangement")
+
+         st.pyplot(plt)
+
+         st.markdown(f"**Cluster {i+1} Details:**")
+         st.markdown(f"Top Words: {', '.join(top_words)}")
+         st.markdown(f"Number of Sentences: {len(cluster)}")
+         st.markdown("---")
+
+ def process_file(file_path):
      with open(file_path, 'r', encoding="utf-8") as file:
          file_text = file.read()
...
      plot_cluster_words(clustered_sentences)

+ # Main code for UI
+ st.title("📺 Transcript Analysis 📊")
+
+ # Display dataframe of .txt files
+ txt_files_df = get_txt_files()
+ st.write("Available .txt files:")
+
+ # Use st.empty() to create a placeholder for the DataFrame
+ df_placeholder = st.empty()
+
+ # Show the editable DataFrame; note that st.data_editor returns the
+ # (possibly edited) DataFrame itself rather than selected indices, so the
+ # first row is treated as the current selection
+ selected_rows = df_placeholder.data_editor(
+     txt_files_df,
+     hide_index=True,
+     key="file_selector"
+ )
+
+ # Initialize session state for selected file if it doesn't exist
+ if 'selected_file' not in st.session_state:
+     st.session_state.selected_file = None
+
+ # Check whether any row is available to act on
+ if selected_rows is not None and len(selected_rows) > 0:
+     selected_file = selected_rows.iloc[0]['File Name']
+
+     # Update session state only if a new file is selected
+     if st.session_state.selected_file != selected_file:
+         st.session_state.selected_file = selected_file
+         st.experimental_rerun()
+
+ # Display the selected file and process button
+ if st.session_state.selected_file:
+     st.write(f"Selected file: {st.session_state.selected_file}")
+     if st.button(f"Process {st.session_state.selected_file}"):
+         file_path = txt_files_df[txt_files_df['File Name'] == st.session_state.selected_file]['Full Path'].iloc[0]
+         process_file(file_path)
+
  st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")