Update app.py

app.py CHANGED
@@ -32,19 +32,31 @@ st.markdown('''
 3. 📺 **Transcript Analysis** 🎙️: Speech recognition and thematic extraction, turning audiovisual content into actionable insights.
 ''')
 
-
-
+@st.cache_resource
+def download_nltk_data():
+    try:
+        nltk.data.find('tokenizers/punkt')
+        nltk.data.find('corpora/stopwords')
+    except LookupError:
+        nltk.download('punkt')
+        nltk.download('stopwords')
+
+download_nltk_data()
 
 def remove_timestamps(text):
     return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)
 
 def extract_high_information_words(text, top_n=10):
-
-
-
-
-
-
+    try:
+        words = nltk.word_tokenize(text)
+        words = [word.lower() for word in words if word.isalpha()]
+        stop_words = set(stopwords.words('english'))
+        filtered_words = [word for word in words if word not in stop_words]
+        freq_dist = FreqDist(filtered_words)
+        return [word for word, _ in freq_dist.most_common(top_n)]
+    except Exception as e:
+        st.error(f"Error in extract_high_information_words: {str(e)}")
+        return []
 
 def create_relationship_graph(words):
     graph = Digraph()
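Note: @st.cache_resource makes the punkt/stopwords lookup run once per server process rather than on every Streamlit rerun. For reference, a minimal standalone sketch of the pipeline the new extract_high_information_words wraps; the sample text is hypothetical, and the imports mirror the ones app.py is assumed to already have at the top:

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

nltk.download('punkt')        # app.py guards these with nltk.data.find() and caches the call
nltk.download('stopwords')

text = "The quick brown fox jumps over the lazy dog because the fox is quick."
words = [w.lower() for w in nltk.word_tokenize(text) if w.isalpha()]       # keep alphabetic tokens only
filtered = [w for w in words if w not in set(stopwords.words('english'))]  # drop English stopwords
print(FreqDist(filtered).most_common(3))  # [('quick', 2), ('fox', 2), ('brown', 1)]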
@@ -157,47 +169,50 @@ def plot_cluster_words(cluster_sentences):
 st.markdown("---")
 
 def process_file(file_path):
-
-
-
-
-
-    st.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    try:
+        with open(file_path, 'r', encoding="utf-8") as file:
+            file_text = file.read()
+        text_without_timestamps = remove_timestamps(file_text)
+        top_words = extract_high_information_words(text_without_timestamps, 10)
+        with st.expander("Top 10 High Information Words"):
+            st.write(top_words)
+        with st.expander("Relationship Graph"):
+            display_relationship_graph(top_words)
+        context_words = extract_context_words(text_without_timestamps, top_words)
+        with st.expander("Context Graph"):
+            display_context_graph(context_words)
+        with st.expander("Context Table"):
+            display_context_table(context_words)
+        sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]
+        num_sentences = len(sentences)
+        st.write(f"Total Sentences: {num_sentences}")
+        num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
+        clustered_sentences = cluster_sentences(sentences, num_clusters)
+        col1, col2 = st.columns(2)
+        with col1:
+            st.subheader("Original Text")
+            original_text = "\n".join(sentences)
+            st.text_area("Original Sentences", value=original_text, height=400)
+        with col2:
+            st.subheader("Clustered Text")
+            clusters = ""
+            clustered_text = ""
+            cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)
+            for i, cluster in enumerate(clustered_sentences):
+                cluster_text = "\n".join(cluster)
+                high_info_words = ", ".join(cluster_high_info_words[i])
+                clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
+                clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"
+            st.text_area("Clusters", value=clusters, height=200)
+            st.text_area("Clustered Sentences", value=clustered_text, height=200)
+        clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
+        if set(sentences) == set(clustered_sentences_flat):
+            st.write("✅ All sentences are accounted for in the clustered output.")
+        else:
+            st.write("❌ Some sentences are missing in the clustered output.")
+        plot_cluster_words(clustered_sentences)
+    except Exception as e:
+        st.error(f"Error processing file: {str(e)}")
 
 def perform_eda(file_name):
     st.subheader(f"EDA for {file_name}")
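Note: the coverage check at the end of process_file flattens the clusters with a nested comprehension and compares sets, so it confirms that every input sentence landed in some cluster, but set() keeps one copy of each sentence, so a lost duplicate would not be caught. A small illustration with hypothetical data:

sentences = ["alpha one", "beta two", "gamma three"]
clustered_sentences = [["alpha one", "gamma three"], ["beta two"]]
flat = [s for cluster in clustered_sentences for s in cluster]  # flatten clusters back to one list
print(set(sentences) == set(flat))  # True: all sentences are accounted for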