awacke1 commited on
Commit
069bed5
·
verified ·
1 Parent(s): df130bb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import os
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.cluster import KMeans
6
+ from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
7
+ import nltk
8
+ from nltk.corpus import stopwords
9
+ from nltk import FreqDist
10
+ import re
11
+ import base64
12
+ from graphviz import Digraph
13
+ from io import BytesIO
14
+ import networkx as nx
15
+ import matplotlib.pyplot as plt
16
+
17
+ # ... [Keep all the existing imports and configurations] ...
18
+
19
def get_txt_files(directory=None):
    """Return a DataFrame listing the transcript ``.txt`` files in *directory*.

    Pip/Spaces configuration files that also end in ``.txt`` are excluded so
    only transcript files are offered to the user.

    Args:
        directory: Folder to scan. Defaults to the current working directory,
            which preserves the original behavior for existing callers.

    Returns:
        pandas.DataFrame with two columns:
        'File Name' (bare filename) and 'Full Path' (absolute path).
    """
    # Configuration files to hide from the file picker.
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}

    directory = directory or '.'

    # List all .txt files excluding the ones in excluded_files.
    txt_files = [f for f in os.listdir(directory)
                 if f.endswith('.txt') and f not in excluded_files]

    # Join against the scanned directory so paths are correct even when a
    # non-default directory is given (abspath alone only works for the cwd).
    return pd.DataFrame({
        'File Name': txt_files,
        'Full Path': [os.path.abspath(os.path.join(directory, f)) for f in txt_files],
    })
33
+
34
+ # ... [Keep all the existing functions] ...
35
+
36
# Main code for UI.
# Flat Streamlit script: lists transcript .txt files, lets the user pick one,
# then renders word statistics, relationship/context graphs, and K-Means
# sentence clusters for the chosen file.
#
# NOTE(review): the helpers called below (remove_timestamps,
# extract_high_information_words, display_relationship_graph,
# extract_context_words, display_context_graph, display_context_table,
# cluster_sentences, get_high_info_words_per_cluster, plot_cluster_words)
# are not defined in this file as committed — the placeholder comment above
# says "Keep all the existing functions"; confirm they exist before running.
st.title("๐Ÿ“บ Transcript Analysis ๐Ÿ“Š")

# Display dataframe of .txt files found in the working directory.
txt_files_df = get_txt_files()
st.write("Available .txt files:")
st.dataframe(txt_files_df)

# Allow user to select a file from the dataframe.
selected_file = st.selectbox("Select a file to process:", txt_files_df['File Name'])

if st.button(f"Process {selected_file}"):
    # Resolve the selected name back to its absolute path via the dataframe.
    file_path = txt_files_df[txt_files_df['File Name'] == selected_file]['Full Path'].iloc[0]
    with open(file_path, 'r', encoding="utf-8") as file:
        file_text = file.read()

    # Process the selected file: strip timestamps, then pick the 10 most
    # informative words from the cleaned text.
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, 10)

    with st.expander("๐Ÿ“Š Top 10 High Information Words"):
        st.write(top_words)

    with st.expander("๐Ÿ“ˆ Relationship Graph"):
        display_relationship_graph(top_words)

    # Words appearing near the top words — used for the context views below.
    context_words = extract_context_words(text_without_timestamps, top_words)

    with st.expander("๐Ÿ”— Context Graph"):
        display_context_graph(context_words)

    with st.expander("๐Ÿ“‘ Context Table"):
        display_context_table(context_words)

    # Keep only non-trivial lines (>10 chars after stripping) as "sentences".
    sentences = [line.strip() for line in file_text.split('\n') if len(line.strip()) > 10]

    num_sentences = len(sentences)
    st.write(f"Total Sentences: {num_sentences}")

    # NOTE(review): a slider inside a button branch is a Streamlit pitfall —
    # moving the slider reruns the script with the button back to False, so
    # this whole section disappears; consider hoisting the slider or using
    # session_state. Left as-is to preserve behavior.
    num_clusters = st.slider("Number of Clusters", min_value=2, max_value=10, value=5)
    clustered_sentences = cluster_sentences(sentences, num_clusters)

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Original Text")
        original_text = "\n".join(sentences)
        st.text_area("Original Sentences", value=original_text, height=400)

    with col2:
        st.subheader("Clustered Text")
        # `clusters` holds one summary line per cluster; `clustered_text`
        # holds the full sentences grouped under each cluster heading.
        clusters = ""
        clustered_text = ""
        cluster_high_info_words = get_high_info_words_per_cluster(clustered_sentences)

        for i, cluster in enumerate(clustered_sentences):
            cluster_text = "\n".join(cluster)
            high_info_words = ", ".join(cluster_high_info_words[i])
            clusters += f"Cluster {i+1} (High Info Words: {high_info_words})\n"
            clustered_text += f"Cluster {i+1} (High Info Words: {high_info_words}):\n{cluster_text}\n\n"

        st.text_area("Clusters", value=clusters, height=200)
        st.text_area("Clustered Sentences", value=clustered_text, height=200)

    # Verify that all sentences are accounted for in the clustered output.
    # Set comparison ignores order and duplicates — presumably acceptable
    # here since the check is informational only.
    clustered_sentences_flat = [sentence for cluster in clustered_sentences for sentence in cluster]
    if set(sentences) == set(clustered_sentences_flat):
        st.write("โœ… All sentences are accounted for in the clustered output.")
    else:
        st.write("โŒ Some sentences are missing in the clustered output.")

    plot_cluster_words(clustered_sentences)

# Footer link, rendered on every run regardless of button state.
st.markdown("For more information and updates, visit our [help page](https://huggingface.co/awacke1).")