Create app.py
app.py
ADDED
@@ -0,0 +1,147 @@
# Import necessary libraries
import streamlit as st
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph

# Set page configuration with a title and favicon
st.set_page_config(
    page_title="📺 Transcript EDA NLTK",
    page_icon="🏠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)

st.markdown('''**Exploratory Data Analysis (EDA)**: Dive deep into the sea of data with our EDA feature, unveiling hidden patterns 🕵️ and insights 🧠 in your transcripts. Transform raw data into a treasure trove of information.
**Natural Language Toolkit (NLTK)** 🛠️: Harness the power of NLTK to process and understand human language 🗣️. From tokenization to sentiment analysis, our toolkit is your compass 🧭 in the vast landscape of natural language processing (NLP).
📺 **Transcript Analysis**: Elevate your text analysis with our advanced transcript analysis tools. Whether it's speech recognition or thematic extraction, turn your audiovisual content into actionable insights.''')

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

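# Note: recent NLTK releases split the Punkt models into a separate
# 'punkt_tab' resource; if nltk.word_tokenize raises a LookupError for
# punkt_tab, adding nltk.download('punkt_tab') here as well should fix it.
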
def remove_timestamps(text):
    return re.sub(r'\d{1,2}:\d{2}\n.*\n', '', text)

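# Illustration of remove_timestamps: given transcript text such as
#   "0:03\nwelcome back everyone\nso today we look at EDA\n"
# the pattern deletes the "0:03" timestamp line together with the line that
# immediately follows it, leaving "so today we look at EDA\n".
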
def extract_high_information_words(text, top_n=10):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]

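# Illustration: for the text "The cat sat on the mat. The cat slept." the
# tokens are lower-cased, punctuation and English stopwords are dropped, and
# the most frequent remaining words are returned, e.g.
#   ['cat', 'sat', 'mat', 'slept']
# (order among equal counts follows first occurrence).
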
def create_relationship_graph(words):
    graph = Digraph()
    for index, word in enumerate(words):
        graph.node(str(index), word)
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))
    return graph

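# Illustration: for words ['data', 'text', 'graph'] the function builds a
# simple chain data -> text -> graph, with each edge labelled by the index of
# its target node.
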
def display_relationship_graph(words):
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)

def extract_context_words(text, high_information_words):
    words = nltk.word_tokenize(text)
    context_words = []
    for index, word in enumerate(words):
        if word.lower() in high_information_words:
            before_word = words[index - 1] if index > 0 else None
            after_word = words[index + 1] if index < len(words) - 1 else None
            context_words.append((before_word, word, after_word))
    return context_words

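# Illustration: with high-information words ['cat'] and the text
# "the cat sat", the function returns [('the', 'cat', 'sat')], i.e. one
# (before, word, after) triple per occurrence of a high-info word, with None
# where no neighbouring token exists at the text boundaries.
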
def create_context_graph(context_words):
    graph = Digraph()
    for index, (before_word, high_info_word, after_word) in enumerate(context_words):
        if before_word:
            graph.node(f'before{index}', before_word, shape='box')
        graph.node(f'high{index}', high_info_word, shape='ellipse')
        if after_word:
            graph.node(f'after{index}', after_word, shape='diamond')
        if before_word:
            graph.edge(f'before{index}', f'high{index}')
        if after_word:
            graph.edge(f'high{index}', f'after{index}')
    return graph

def display_context_graph(context_words):
    graph = create_context_graph(context_words)
    st.graphviz_chart(graph)

def display_context_table(context_words):
    table = "| Before | High Info Word | After |\n|--------|----------------|-------|\n"
    for before, high, after in context_words:
        table += f"| {before if before else ''} | {high} | {after if after else ''} |\n"
    st.markdown(table)


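# Illustration: for the single triple ('the', 'cat', 'sat') the loop appends
# the row "| the | cat | sat |" beneath the Markdown table header, which
# Streamlit then renders as a three-column table.
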
def load_example_files():
    # Exclude specific files
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}

    # List all .txt files excluding the ones in excluded_files
    example_files = [f for f in os.listdir() if f.endswith('.txt') and f not in excluded_files]

    # Check if there are any files to select from
    if example_files:
        selected_file = st.selectbox("Select an example file:", example_files)
        if st.button(f"Load {selected_file}"):
            with open(selected_file, 'r', encoding="utf-8") as file:
                return file.read()
    else:
        st.write("No suitable example files found.")

    return None

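# Note: st.button is only True on the rerun triggered by the click, so the
# example text is returned for that single run; persisting it across reruns
# would require something like st.session_state.
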
# Older version of the example-file loader, kept for reference (not called)
def load_example_files_old():
    example_files = [f for f in os.listdir() if f.endswith('.txt')]
    selected_file = st.selectbox("Select an example file:", example_files)
    if st.button(f"Load {selected_file}"):
        with open(selected_file, 'r', encoding="utf-8") as file:
            return file.read()
    return None

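# showInnovationOutlines() is called at the bottom of this file but never
# defined in it; the placeholder below is a minimal stand-in (an assumption,
# not the original implementation) so the call does not raise a NameError.
def showInnovationOutlines():
    st.markdown("Innovation outlines are not included in this file.")
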
# Main code for UI
uploaded_file = st.file_uploader("Choose a .txt file", type=['txt'])

example_text = load_example_files()

if example_text:
    file_text = example_text
elif uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""

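# Note: a loaded example file takes precedence over an uploaded file because
# the example branch is checked first.
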
if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, 10)

    with st.expander("Top 10 High Information Words"):
        st.write(top_words)

    with st.expander("Relationship Graph"):
        display_relationship_graph(top_words)

    context_words = extract_context_words(text_without_timestamps, top_words)

    with st.expander("Context Graph"):
        display_context_graph(context_words)

    with st.expander("Context Table"):
        display_context_table(context_words)

    with st.expander("Innovation Outlines"):
        showInnovationOutlines()