awacke1 commited on
Commit
e5c46ba
·
verified ·
1 Parent(s): 689259a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -0
app.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import necessary libraries
2
+ import streamlit as st
3
+ import re
4
+ import nltk
5
+ import os
6
+ from nltk.corpus import stopwords
7
+ from nltk import FreqDist
8
+ from graphviz import Digraph
9
+
10
# Set page configuration with a title and favicon.
# Must run before any other Streamlit call; menu_items customizes the
# hamburger menu's Help / Bug report / About links.
st.set_page_config(
    page_title="๐Ÿ“บTranscript๐Ÿ“œEDA๐Ÿ”NLTK",
    page_icon="๐ŸŒ ",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/awacke1',
        'Report a bug': "https://huggingface.co/spaces/awacke1/WebDataDownload",
        'About': "# Midjourney: https://discord.com/channels/@me/997514686608191558"
    }
)
22
+
23
+ st.markdown('''๐Ÿ” **Exploratory Data Analysis (EDA)** ๐Ÿ“Š: - Dive deep into the sea of data with our EDA feature, unveiling hidden patterns ๐Ÿ•ต๏ธโ€โ™‚๏ธ and insights ๐Ÿง  in your transcripts. Transform raw data into a treasure trove of information ๐Ÿ†.
24
+ ๐Ÿ“œ **Natural Language Toolkit (NLTK)** ๐Ÿ› ๏ธ: - Harness the power of NLTK to process and understand human language ๐Ÿ—ฃ๏ธ. From tokenization to sentiment analysis, our toolkit is your compass ๐Ÿงญ in the vast landscape of natural language processing (NLP).
25
+ ๐Ÿ“บ **Transcript Analysis** ๐Ÿ“ˆ: - Elevate your text analysis with our advanced transcript analysis tools. Whether it's speech recognition ๐ŸŽ™๏ธ or thematic extraction ๐ŸŒ, turn your audiovisual content into actionable insights ๐Ÿ”‘.''')
26
+
27
# Download NLTK resources.
# 'punkt' supplies the tokenizer models used by nltk.word_tokenize below;
# 'stopwords' supplies the English stopword list filtered out during EDA.
# Downloads are cached, so repeated app runs are cheap after the first.
nltk.download('punkt')
nltk.download('stopwords')
30
+
31
def remove_timestamps(text):
    """Strip transcript timestamp lines and the caption line that follows each.

    Matches lines such as "0:12" or "12:34", and — generalized — "1:02:03"
    (h:mm:ss), each followed by one caption line; both lines are removed.

    The original pattern (\\d{1,2}:\\d{2}\\n.*\\n) mis-handled h:mm:ss stamps:
    it matched only the trailing "02:03" of "1:02:03", deleting the wrong
    span and leaving a dangling "1:" fused to the remaining text. The
    optional non-capturing (?::\\d{2})? group fixes that.

    :param text: full transcript text.
    :return: text with timestamp/caption line pairs removed.
    """
    return re.sub(r'\d{1,2}:\d{2}(?::\d{2})?\n.*\n', '', text)
33
+
34
def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic, non-stopword tokens in text.

    Tokens are lowercased; punctuation/numeric tokens and English stopwords
    are discarded before counting frequencies with NLTK's FreqDist.

    :param text: raw text to analyze.
    :param top_n: how many of the most common words to return.
    :return: list of up to top_n words, most frequent first.
    """
    tokens = [tok.lower() for tok in nltk.word_tokenize(text) if tok.isalpha()]
    english_stops = set(stopwords.words('english'))
    informative = (tok for tok in tokens if tok not in english_stops)
    return [word for word, _count in FreqDist(informative).most_common(top_n)]
41
+
42
def create_relationship_graph(words):
    """Build a linear Graphviz chain of the given words.

    Each word becomes a node keyed by its position; consecutive words are
    connected by an edge labeled with the later word's position.

    :param words: ordered sequence of words.
    :return: graphviz.Digraph linking the words in order.
    """
    chain = Digraph()
    previous = None
    for position, term in enumerate(words):
        chain.node(str(position), term)
        if previous is not None:
            chain.edge(str(previous), str(position), label=str(position))
        previous = position
    return chain
49
+
50
def display_relationship_graph(words):
    """Render the word-chain relationship graph inside the Streamlit app."""
    st.graphviz_chart(create_relationship_graph(words))
53
+
54
def extract_context_words(text, high_information_words):
    """Collect the immediate neighbors of every high-information token.

    Tokenizes text and, for each token whose lowercase form appears in
    high_information_words, records a (before, word, after) triple. The
    before/after entries are None at the start/end of the token stream.

    :param text: text to scan.
    :param high_information_words: container of lowercase target words.
    :return: list of (before_word, word, after_word) tuples.
    """
    tokens = nltk.word_tokenize(text)
    last = len(tokens) - 1
    triples = []
    for pos, token in enumerate(tokens):
        if token.lower() not in high_information_words:
            continue
        preceding = tokens[pos - 1] if pos > 0 else None
        following = tokens[pos + 1] if pos < last else None
        triples.append((preceding, token, following))
    return triples
63
+
64
def create_context_graph(context_words):
    """Build a Graphviz graph of each high-information word and its neighbors.

    For every (before_word, high_info_word, after_word) triple, the
    high-information word is drawn as an ellipse; its predecessor (if any)
    as a box with an edge into it, and its successor (if any) as a diamond
    with an edge out of it.

    Cleanup vs. original: removed the commented-out duplicate node lines and
    the single-line `if x: stmt # else None` forms, and merged the duplicated
    before/after conditionals so each neighbor is tested exactly once.

    :param context_words: iterable of (before, word, after) triples, as
        produced by extract_context_words; before/after may be None.
    :return: graphviz.Digraph of the context relationships.
    """
    graph = Digraph()
    for index, (before_word, high_info_word, after_word) in enumerate(context_words):
        graph.node(f'high{index}', high_info_word, shape='ellipse')
        if before_word:
            graph.node(f'before{index}', before_word, shape='box')
            graph.edge(f'before{index}', f'high{index}')
        if after_word:
            graph.node(f'after{index}', after_word, shape='diamond')
            graph.edge(f'high{index}', f'after{index}')
    return graph
77
+
78
def display_context_graph(context_words):
    """Render the context graph for the given (before, word, after) triples."""
    st.graphviz_chart(create_context_graph(context_words))
81
+
82
def display_context_table(context_words):
    """Show the (before, word, after) triples as a Markdown table in Streamlit.

    None neighbors render as empty cells.
    """
    rows = [
        "| Before | High Info Word | After |",
        "|--------|----------------|-------|",
    ]
    for before, high, after in context_words:
        rows.append(f"| {before if before else ''} | {high} | {after if after else ''} |")
    st.markdown("\n".join(rows) + "\n")
87
+
88
+
89
def load_example_files():
    """Offer a picker over local example .txt files and return the chosen text.

    Lists .txt files in the working directory, excluding config files
    (requirements.txt etc.). Shows a selectbox plus a load button; returns
    the selected file's contents once the button is pressed, otherwise None.
    Writes a notice and returns None when no suitable files exist.
    """
    # Configuration files that must never be offered as transcripts.
    excluded_files = {'freeze.txt', 'requirements.txt', 'packages.txt', 'pre-requirements.txt'}

    candidates = [name for name in os.listdir()
                  if name.endswith('.txt') and name not in excluded_files]

    # Guard clause: nothing to offer.
    if not candidates:
        st.write("No suitable example files found.")
        return None

    selected_file = st.selectbox("๐Ÿ“„ Select an example file:", candidates)
    if st.button(f"๐Ÿ“‚ Load {selected_file}"):
        with open(selected_file, 'r', encoding="utf-8") as file:
            return file.read()
    return None
106
+
107
# Load example files
# NOTE(review): legacy variant superseded by load_example_files() above — it
# does not exclude config .txt files (requirements.txt etc.) and is never
# called by the UI code below. Candidate for deletion; kept byte-identical.
def load_example_files_old():
    """Select any local .txt file via a selectbox and return its contents
    after the load button is pressed; otherwise return None."""
    example_files = [f for f in os.listdir() if f.endswith('.txt')]
    selected_file = st.selectbox("๐Ÿ“„ Select an example file:", example_files)
    if st.button(f"๐Ÿ“‚ Load {selected_file}"):
        with open(selected_file, 'r', encoding="utf-8") as file:
            return file.read()
    return None
115
+
116
# Main code for UI
# Source text comes from either an example file (takes precedence) or an
# uploaded .txt file; all analysis sections render only when text is present.
uploaded_file = st.file_uploader("๐Ÿ“ Choose a .txt file", type=['txt'])

example_text = load_example_files()

if example_text:
    file_text = example_text
elif uploaded_file:
    file_text = uploaded_file.read().decode("utf-8")
else:
    file_text = ""

if file_text:
    text_without_timestamps = remove_timestamps(file_text)
    top_words = extract_high_information_words(text_without_timestamps, 10)

    with st.expander("๐Ÿ“Š Top 10 High Information Words"):
        st.write(top_words)

    with st.expander("๐Ÿ“ˆ Relationship Graph"):
        display_relationship_graph(top_words)

    context_words = extract_context_words(text_without_timestamps, top_words)

    with st.expander("๐Ÿ”— Context Graph"):
        display_context_graph(context_words)

    with st.expander("๐Ÿ“‘ Context Table"):
        display_context_table(context_words)

    # BUG FIX: showInnovationOutlines() was called unconditionally but is not
    # defined anywhere in this file, so reaching this point always raised
    # NameError and crashed the app. Guard the call so the app degrades
    # gracefully until the function is actually implemented.
    with st.expander("Innovation Outlines"):
        _outlines_fn = globals().get("showInnovationOutlines")
        if callable(_outlines_fn):
            _outlines_fn()
        else:
            st.write("Innovation outlines are not available yet.")