Spaces:
Sleeping
Sleeping
File size: 3,982 Bytes
8efe659 bd2022e b10c920 c447545 13ac7c2 86fc40d bd2022e 9de97c6 bd2022e adc4ff3 bd2022e b10c920 86fc40d 2fe2a42 c447545 13ac7c2 5777a9a 9de97c6 5777a9a f9779a0 a6daf3c 86fc40d a6daf3c 767cd38 86fc40d 767cd38 bd2022e 767cd38 bd2022e b10c920 c447545 1efe83d 5777a9a f9779a0 a02ed2b b10c920 13ac7c2 56c79b1 767cd38 56c79b1 767cd38 1efe83d a6daf3c 9de97c6 3fd92e9 767cd38 9de97c6 c447545 767cd38 a6daf3c 9de97c6 3fd92e9 b10c920 3fd92e9 c447545 3fd92e9 e8878ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration
from mailparser import parse_from_file
from bs4 import BeautifulSoup
from gliner import GLiNER
from typing import Dict, Union, List
import spacy
import re
import os
import en_core_web_sm
# spaCy English pipeline; used by get_sentences() for sentence segmentation.
nlp = en_core_web_sm.load()
# T5 model + tokenizer used by refine_entities_with_t5() to post-process entities.
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
# Cache of loaded GLiNER models keyed by model name (populated by get_model()).
_MODEL = {}
# Optional Hugging Face cache directory; None uses the default cache location.
_CACHE_DIR = os.environ.get("CACHE_DIR", None)
def accept_mail(file_path):
    """Parse an .eml file from disk and return the mailparser email object."""
    return parse_from_file(file_path)
def clean_email(email):
    """Strip HTML from the email body and collapse all whitespace to single spaces."""
    soup = BeautifulSoup(email.body, 'html.parser')
    # Remove presentation-only tags whose text would pollute the output.
    for junk in soup.find_all(['style', 'link']):
        junk.decompose()
    text = soup.get_text(separator=' ')
    return ' '.join(text.split())
def remove_special_characters(text):
    """Delete runs of '=', '_' and '-' (common email padding/divider characters)."""
    return re.sub(r'[=_-]+', '', text)
def get_sentences(further_cleaned_text):
    """Split text into sentences using the module-level spaCy pipeline."""
    return [sentence.text for sentence in nlp(further_cleaned_text).sents]
def get_model(model_name: str = None, multilingual: bool = False):
    """Return a cached GLiNER model, loading it on first use.

    When no explicit name is given, the checkpoint is chosen by the
    `multilingual` flag (base vs. multilingual variant).
    """
    if model_name is None:
        model_name = ("urchade/gliner_multilingual" if multilingual
                      else "urchade/gliner_base")
    try:
        return _MODEL[model_name]
    except KeyError:
        model = GLiNER.from_pretrained(model_name, cache_dir=_CACHE_DIR)
        _MODEL[model_name] = model
        return model
def parse_query(sentences, labels, threshold=0.3, nested_ner=False, model_name=None, multilingual=False):
    """Run GLiNER entity extraction over each sentence and flatten the matches.

    Returns a list of {"text": ..., "label": ...} dicts across all sentences.
    (`nested_ner` is accepted for interface compatibility but not forwarded.)
    """
    model = get_model(model_name, multilingual)
    found = []
    for text in sentences:
        for entity in model.predict_entities(text, labels, threshold=threshold):
            found.append({"text": entity["text"], "label": entity["label"]})
    return found
def refine_entities_with_t5(entities):
    """Feed the extracted entities through T5 and return its decoded text output."""
    fragments = [f"{e['text']} as {e['label']}" for e in entities]
    prompt = "refine entities: " + " ; ".join(fragments)
    input_ids = t5_tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True)
    generated = t5_model.generate(input_ids)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)
def present(email_file, labels, multilingual=False):
    """End-to-end pipeline: parse an .eml file, clean it, extract and refine entities.

    Args:
        email_file: Path to the uploaded .eml file.
        labels: Entity labels to detect (from the UI checkbox group).
        multilingual: When True, use the multilingual GLiNER checkpoint.

    Returns:
        [subject, from, to, date, extracted entities, refined entities],
        matching the order of the Gradio output components.
    """
    email = accept_mail(email_file)
    cleaned_text = clean_email(email)
    further_cleaned_text = remove_special_characters(cleaned_text)
    sentence_list = get_sentences(further_cleaned_text)
    # BUG FIX: model_name was hard-coded to "urchade/gliner_base", which made
    # get_model() skip its multilingual branch entirely — the "Use Multilingual
    # Model" checkbox had no effect. Pass None so get_model() picks the
    # checkpoint from the `multilingual` flag.
    entities = parse_query(sentence_list, labels, threshold=0.3,
                           nested_ner=False, model_name=None,
                           multilingual=multilingual)
    refined_entities = refine_entities_with_t5(entities)
    return [
        email.subject,
        email.from_,
        email.to,
        email.date,
        entities,          # rendered as a DataFrame by the Gradio output
        refined_entities,
    ]
# Entity labels offered in the UI; all are pre-selected by default.
labels = ["PERSON", "PRODUCT", "DEAL", "ORDER", "ORDER PAYMENT METHOD", "STORE", "LEGAL ENTITY", "MERCHANT", "FINANCIAL TRANSACTION", "UNCATEGORIZED", "DATE"]
# Gradio UI: one input row (.eml upload, label selection, multilingual toggle)
# mapped positionally onto present()'s parameters; outputs match the order of
# the list returned by present().
demo = gr.Interface(
    fn=present,
    inputs=[
        gr.components.File(label="Upload Email (.eml file)"),
        gr.components.CheckboxGroup(choices=labels, label="Labels to Detect", value=labels),
        gr.components.Checkbox(label="Use Multilingual Model")
    ],
    outputs=[
        gr.components.Textbox(label="Subject"),
        gr.components.Textbox(label="From"),
        gr.components.Textbox(label="To"),
        gr.components.Textbox(label="Date"),
        gr.components.Dataframe(headers=["Text", "Label"], label="Extracted Entities"),
        gr.components.Textbox(label="Refined Entities")
    ],
    title="Email Info Extractor",
    description="Upload an email file (.eml) to extract its details and detected entities."
)
# share=True publishes a temporary public Gradio link in addition to localhost.
demo.launch(share=True)
|