Carlo Moro committed · Commit b4f77f4
1 parent: a91104a

Simplifying code

Files changed:
- app.py: +13 -159
- requirements.txt: +1 -15
app.py CHANGED
@@ -1,166 +1,15 @@
-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.decomposition import LatentDirichletAllocation
-import tiktoken, nltk, numpy as np, fasttext, pickle, re
-from minivectordb.embedding_model import EmbeddingModel
-from sklearn.metrics.pairwise import cosine_similarity
-from nltk.tokenize import sent_tokenize
+from compressor.semantic import compress_text, count_tokens
 import gradio as gr

-
-
-
-
-langdetect_model = fasttext.load_model('lid.176.ftz')
-embedding_model = EmbeddingModel(onnx_model_cpu_core_count=2)
-english_stopwords = pickle.load(open("en_stopwords.pkl", "rb"))
-portuguese_stopwords = pickle.load(open("pt_stopwords.pkl", "rb"))
-tokenizer = tiktoken.encoding_for_model("gpt-4")
-
-def count_tokens_tiktoken(text):
-    return len(tokenizer.encode(text))
-
-def detect_language(text):
-    detected_lang = langdetect_model.predict(text.replace('\n', ' '), k=1)[0][0]
-    return 'pt' if (str(detected_lang) == '__label__pt' or str(detected_lang) == 'portuguese') else 'en'
-
-def clean_and_standardize_text(text):
-    # 1. Standardize spacing around punctuation
-    text = re.sub(r'\s([.,;:!?])\s', r'\1 ', text)
-
-    # 2. Remove extra spaces
-    text = re.sub(r'\s+', ' ', text).strip()
-
-    # 3. Capitalize sentences
-    sentences = sent_tokenize(text)
-    text = '. '.join(sentence.capitalize() for sentence in sentences)
-
-    # 4. Standardize number formatting
-    text = re.sub(r'(\d+)\s+(\d+)', r'\1.\2', text)
-
-    # 5. Ensure proper spacing after closing parentheses
-    text = re.sub(r'\)\s*([a-zA-Z])', r') \1', text)
-
-    # 6. Preserve bullet points
-    text = re.sub(r'•\s*', '• ', text)
-
-    # 7. Preserve numbered lists
-    text = re.sub(r'(\d+)\.\s*', r'\1. ', text)
-
-    # 8. Standardize date formatting
-    text = re.sub(r'(\d{2})\s+(\d{2})\s+(\d{4})', r'\1/\2/\3', text)
-
-    # 9. Remove extra periods
-    text = re.sub(r'\.\s+\.', '. ', text)
-
-    # 10. Remove spacing around parentheses
-    text = re.sub(r'\(\s*', '(', text)
-    text = re.sub(r'\s*\)', ')', text)
-
-    # 11. Improve spacing around punctuations
-    while ' .' in text:
-        text = text.replace(' .', '.')
-
-    while '..' in text:
-        text = text.replace('..', '.')
-
-    while '  ' in text:
-        text = text.replace('  ', ' ')
-
-    text = text.replace(' :', ':')
-    text = text.replace('- -', '-')
-    text = text.replace('. -', '.')
-
-    # 12. Detect two punctuation marks in a row, keeping the last
-    text = re.sub(r'([.,]){2,}', r'\1', text)
-    text = re.sub(r'(?<=[:.])[:.]+', '', text)
-
-    return text
-
-def semantic_compress_text(full_text, compression_rate=0.7, num_topics=5):
-    def calculate_similarity(embed1, embed2):
-        return cosine_similarity([embed1], [embed2])[0][0]
-
-    def create_lda_model(texts, stopwords):
-        vectorizer = CountVectorizer(stop_words=stopwords)
-        doc_term_matrix = vectorizer.fit_transform(texts)
-        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
-        lda.fit(doc_term_matrix)
-        return lda, vectorizer
-
-    def get_topic_distribution(text, lda, vectorizer):
-        vec = vectorizer.transform([text])
-        return lda.transform(vec)[0]
-
-    def sentence_importance(sentence, doc_embedding, lda_model, vectorizer, stopwords):
-        sentence_embedding = embedding_model.extract_embeddings(sentence)
-        semantic_similarity = calculate_similarity(doc_embedding, sentence_embedding)
-
-        topic_dist = get_topic_distribution(sentence, lda_model, vectorizer)
-        topic_importance = np.max(topic_dist)
-
-        # Calculate lexical diversity
-        words = sentence.split()
-        unique_words = set([word.lower() for word in words if word.lower() not in stopwords])
-        lexical_diversity = len(unique_words) / len(words) if words else 0
-
-        # Combine factors
-        importance = (0.6 * semantic_similarity) + (0.3 * topic_importance) + (0.2 * lexical_diversity)
-        return importance
-
-    # Split the text into sentences
-    sentences = sent_tokenize(full_text)
-    final_sentences = []
-    for s in sentences:
-        broken_sentences = s.split('\n')
-        final_sentences.extend(broken_sentences)
-    sentences = final_sentences
-
-    text_lang = detect_language(full_text)
-
-    # Create LDA model
-    lda_model, vectorizer = create_lda_model(sentences, portuguese_stopwords if text_lang == 'pt' else english_stopwords)
-
-    # Get document-level embedding
-    doc_embedding = embedding_model.extract_embeddings(full_text)
-
-    # Calculate importance for each sentence
-    sentence_scores = [(sentence, sentence_importance(sentence, doc_embedding, lda_model, vectorizer, portuguese_stopwords if text_lang == 'pt' else english_stopwords))
-                       for sentence in sentences]
-
-    # Sort sentences by importance
-    sorted_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
-
-    # Determine how many words to keep
-    total_words = sum(len(sentence.split()) for sentence in sentences)
-    target_words = int(total_words * compression_rate)
-
-    # Reconstruct the compressed text
-    compressed_text = []
-    current_words = 0
-    for sentence, _ in sorted_sentences:
-        sentence_words = len(sentence.split())
-        if current_words + sentence_words <= target_words:
-            compressed_text.append(sentence)
-            current_words += sentence_words
-        else:
-            break
-
-    # Reorder sentences to maintain original flow
-    compressed_text.sort(key=lambda x: sentences.index(x))
-
-    joined_compressed_text = ' '.join(compressed_text)
-    joined_compressed_text_cleaned = clean_and_standardize_text(joined_compressed_text)
-    return joined_compressed_text_cleaned
-
-async def predict(text, word_reduction_factor):
-    if len(text.split()) > 5000:
-        return "Text is too long for this demo. Please provide a text with less than 5000 words."
+async def predict(text, word_reduction_factor, reference_text_steering):
+    if len(text.split()) > 10000:
+        return "Text is too long for this demo. Please provide a text with less than 10000 words."

     if word_reduction_factor is None:
         word_reduction_factor = 0.5

-    compressed = semantic_compress_text(text, compression_rate=word_reduction_factor)
-    perc_reduction = round(100 - (count_tokens_tiktoken(compressed) / count_tokens_tiktoken(text)) * 100, 2)
+    compressed = compress_text(text, compression_rate= word_reduction_factor, reference_text_steering=reference_text_steering)
+    perc_reduction = round(100 - (count_tokens(compressed) / count_tokens(text)) * 100, 2)

     return f"{compressed}\n\nToken Reduction: {perc_reduction}%"

@@ -182,13 +31,18 @@ reduction_factor = gr.Slider(
     interactive=True,
     label="Reduction Factor"
 )
+
 # Create the gradio interface
 gr.Interface(
     fn=predict,
-    inputs=[
+    inputs=[
+        gr.Textbox(lines=10, label="Input Text"),
+        reduction_factor,
+        gr.Textbox(lines=5, label="Reference text to steer compression (Optional)", placeholder="Enter reference text to steer compression towards this text")
+    ],
     outputs=[gr.Textbox(label="Compressed Text")],
     title=gradio_title,
     description=gradio_description,
     examples=gradio_examples,
-
+    flagging_mode="never"
 ).launch()
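Note: the simplified app.py delegates all compression logic to the semantic-compressor package. Below is a minimal standalone sketch of that usage, assuming the package exposes compress_text and count_tokens exactly as imported in the new app.py and that the reference_text_steering argument may be omitted; the input file name and the 0.5 rate are illustrative, not part of the commit.

    from compressor.semantic import compress_text, count_tokens

    # Illustrative input; any plain-text string works here.
    text = open("article.txt", encoding="utf-8").read()

    # Keep roughly half of the content (same call the new predict() makes,
    # without the optional reference text).
    compressed = compress_text(text, compression_rate=0.5)

    # Report the token reduction the same way predict() does.
    perc_reduction = round(100 - (count_tokens(compressed) / count_tokens(text)) * 100, 2)
    print(f"{compressed}\n\nToken Reduction: {perc_reduction}%")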
requirements.txt CHANGED
@@ -1,17 +1,3 @@
 huggingface_hub==0.22.2
-tiktoken
-fasttext
-minivectordb==1.5.5
 gradio==4.31.4
-
-scikit-learn
-numpy==1.26.4
-onnx
-onnxruntime
-onnxruntime-extensions
-transformers==4.37.2
-torch
-faiss-cpu
-thefuzz[speedup]
-FlagEmbedding
-peft
+semantic-compressor