Carlo Moro committed on
Commit b4f77f4 · 1 Parent(s): a91104a

Simplifying code

Files changed (2):
  1. app.py +13 -159
  2. requirements.txt +1 -15
app.py CHANGED
@@ -1,166 +1,15 @@
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.decomposition import LatentDirichletAllocation
-import tiktoken, nltk, numpy as np, fasttext, pickle, re
-from minivectordb.embedding_model import EmbeddingModel
-from sklearn.metrics.pairwise import cosine_similarity
-from nltk.tokenize import sent_tokenize
+from compressor.semantic import compress_text, count_tokens
 import gradio as gr

-nltk.download('punkt')
-nltk.download('stopwords')
-nltk.download('punkt_tab')
-
-langdetect_model = fasttext.load_model('lid.176.ftz')
-embedding_model = EmbeddingModel(onnx_model_cpu_core_count=2)
-english_stopwords = pickle.load(open("en_stopwords.pkl", "rb"))
-portuguese_stopwords = pickle.load(open("pt_stopwords.pkl", "rb"))
-tokenizer = tiktoken.encoding_for_model("gpt-4")
-
-def count_tokens_tiktoken(text):
-    return len(tokenizer.encode(text))
-
-def detect_language(text):
-    detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
-    return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
-
-def clean_and_standardize_text(text):
-    # 1. Standardize spacing around punctuation
-    text = re.sub(r'\s([.,;:!?])\s', r'\1 ', text)
-
-    # 2. Remove extra spaces
-    text = re.sub(r'\s+', ' ', text).strip()
-
-    # 3. Capitalize sentences
-    sentences = sent_tokenize(text)
-    text = '. '.join(sentence.capitalize() for sentence in sentences)
-
-    # 4. Standardize number formatting
-    text = re.sub(r'(\d+)\s+(\d+)', r'\1.\2', text)
-
-    # 5. Ensure proper spacing after closing parentheses
-    text = re.sub(r'\)\s*([a-zA-Z])', r') \1', text)
-
-    # 6. Preserve bullet points
-    text = re.sub(r'•\s*', '• ', text)
-
-    # 7. Preserve numbered lists
-    text = re.sub(r'(\d+)\.\s*', r'\1. ', text)
-
-    # 8. Standardize date formatting
-    text = re.sub(r'(\d{2})\s+(\d{2})\s+(\d{4})', r'\1/\2/\3', text)
-
-    # 9. Remove extra periods
-    text = re.sub(r'\.\s+\.', '. ', text)
-
-    # 10. Remove spacing around parentheses
-    text = re.sub(r'\(\s*', '(', text)
-    text = re.sub(r'\s*\)', ')', text)
-
-    # 11. Improve spacing around punctuations
-    while ' .' in text:
-        text = text.replace(' .', '.')
-
-    while '..' in text:
-        text = text.replace('..', '.')
-
-    while '  ' in text:
-        text = text.replace('  ', ' ')
-
-    text = text.replace(' :', ':')
-    text = text.replace('- -', '-')
-    text = text.replace('. -', '.')
-
-    # 12. Detect two punctuation marks in a row, keeping the last
-    text = re.sub(r'([.,]){2,}', r'\1', text)
-    text = re.sub(r'(?<=[:.])[:.]+', '', text)
-
-    return text
-
-def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
-    def calculate_similarity(embed1, embed2):
-        return cosine_similarity([embed1], [embed2])[0][0]
-
-    def create_lda_model(texts, stopwords):
-        vectorizer = CountVectorizer(stop_words=stopwords)
-        doc_term_matrix = vectorizer.fit_transform(texts)
-        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
-        lda.fit(doc_term_matrix)
-        return lda, vectorizer
-
-    def get_topic_distribution(text, lda, vectorizer):
-        vec = vectorizer.transform([text])
-        return lda.transform(vec)[0]
-
-    def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding = embedding_model.extract_embeddings(sentence)
-        semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
-
-        topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
-        topic_importance = np.max(topic_dist)
-
-        # Calculate lexical diversity
-        words = sentence.split()
-        unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
-        lexical_diversity = len(unique_words) / len(words) if words else 0
-
-        # Combine factors
-        importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
-        return importance
-
-    # Split the text into sentences
-    sentences = sent_tokenize(full_text)
-    final_sentences = []
-    for s in sentences:
-        broken_sentences = s.split('\n')
-        final_sentences.extend(broken_sentences)
-    sentences = final_sentences
-
-    text_lang = detect_language(full_text)
-
-    # Create LDA model
-    lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
-
-    # Get document-level embedding
-    doc_embedding = embedding_model.extract_embeddings(full_text)
-
-    # Calculate importance for each sentence
-    sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
-                       for sentence in sentences]
-
-    # Sort sentences by importance
-    sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
-
-    # Determine how many words to keep
-    total_words = sum(len(sentence.split()) for sentence in sentences)
-    target_words = int(total_words * compression_rate)
-
-    # Reconstruct the compressed text
-    compressed_text = []
-    current_words = 0
-    for sentence, _ in sorted_sentences:
-        sentence_words = len(sentence.split())
-        if current_words + sentence_words <= target_words:
-            compressed_text.append(sentence)
-            current_words += sentence_words
-        else:
-            break
-
-    # Reorder sentences to maintain original flow
-    compressed_text.sort(key=lambda x: sentences.index(x))
-
-    joined_compressed_text = ' '.join(compressed_text)
-    joined_compressed_text_cleaned = clean_and_standardize_text(joined_compressed_text)
-    return joined_compressed_text_cleaned
-
-async def predict(text, word_reduction_factor):
-    if len(text.split()) > 5000:
-        return "Text is too long for this demo. Please provide a text with less than 5000 words."
+async def predict(text, word_reduction_factor, reference_text_steering):
+    if len(text.split()) > 10000:
+        return "Text is too long for this demo. Please provide a text with less than 10000 words."

     if word_reduction_factor is None:
         word_reduction_factor = 0.5

-    compressed = semantic_compress_text(text, compression_rate=1 - word_reduction_factor)
-    perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / count_tokens_tiktoken(text)) * 100, 2)
+    compressed = compress_text(text, compression_rate=word_reduction_factor, reference_text_steering=reference_text_steering)
+    perc_reduction = round(100 - (count_tokens(compressed) / count_tokens(text)) * 100, 2)

     return f"{compressed}\n\nToken Reduction: {perc_reduction}%"

@@ -182,13 +31,18 @@ reduction_factor = gr.Slider(
     interactive=True,
     label="Reduction Factor"
 )
+
 # Create the gradio interface
 gr.Interface(
     fn=predict,
-    inputs=[gr.Textbox(lines=10, label="Input Text"), reduction_factor],
+    inputs=[
+        gr.Textbox(lines=10, label="Input Text"),
+        reduction_factor,
+        gr.Textbox(lines=5, label="Reference text to steer compression (Optional)", placeholder="Enter reference text to steer compression towards this text")
+    ],
     outputs=[gr.Textbox(label="Compressed Text")],
     title=gradio_title,
     description=gradio_description,
     examples=gradio_examples,
-    allow_flagging="never"
+    flagging_mode="never"
 ).launch()
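
For reference, a minimal sketch (not part of the commit) of the simplified flow that app.py now delegates to the semantic-compressor package. It assumes compress_text and count_tokens behave as called in the new code above; the sample text and steering string are illustrative only.

# Sketch: one-call replacement for the old in-app LDA/embedding pipeline.
from compressor.semantic import compress_text, count_tokens

sample = "Some long passage of text to shrink for the demo..."
# Mirrors the predict() call in app.py: compression_rate comes from the
# "Reduction Factor" slider, reference_text_steering from the optional textbox.
compressed = compress_text(sample, compression_rate=0.5,
                           reference_text_steering="focus on the key findings")
reduction = round(100 - (count_tokens(compressed) / count_tokens(sample)) * 100, 2)
print(f"{compressed}\n\nToken Reduction: {reduction}%")
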
requirements.txt CHANGED
@@ -1,17 +1,3 @@
 huggingface_hub==0.22.2
-tiktoken
-fasttext
-minivectordb==1.5.5
 gradio==4.31.4
-nltk
-scikit-learn
-numpy==1.26.4
-onnx
-onnxruntime
-onnxruntime-extensions
-transformers==4.37.2
-torch
-faiss-cpu
-thefuzz[speedup]
-FlagEmbedding
-peft
+semantic-compressor