Spaces:
Sleeping
Sleeping
File size: 5,080 Bytes
d9acf37 ce9dc81 d9acf37 b2fb02e d9acf37 b2fb02e ce9dc81 b2fb02e ce9dc81 b2fb02e d9acf37 b2fb02e ce9dc81 b2fb02e d9acf37 ce9dc81 d9acf37 ce9dc81 d9acf37 ce9dc81 d9acf37 ce9dc81 d9acf37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
import difflib
import html
from io import StringIO

import docx2txt
import pdfplumber
import PyPDF4
import streamlit as st
from huggingface_hub import InferenceClient  # Import Hugging Face API
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ========== CONFIG ==========
st.set_page_config(page_title="📑 Contract Analyzer", layout="wide")
# ========== FUNCTIONS ==========
# Load the Hugging Face model from the Hub
@st.cache_resource
def load_inference_client():
    """Build the Hugging Face inference client used to query the Zephyr model.

    The function is decorated with ``st.cache_resource`` so the client it
    returns can be reused instead of being rebuilt on every call.

    Returns:
        InferenceClient: client targeting ``HuggingFaceH4/zephyr-7b-beta``.
    """
    # InferenceClient selects its target through ``model``; it accepts no
    # ``repo_id`` keyword (that belonged to the legacy InferenceApi class).
    return InferenceClient(model="HuggingFaceH4/zephyr-7b-beta")  # Zephyr model
# Module-level client instance; query_zephyr_model reads this global name.
inference_client = load_inference_client()
def extract_text_from_pdf(uploaded_file):
    """Extract the text of a PDF, trying pdfplumber first and PyPDF4 second.

    Args:
        uploaded_file: PDF source passed to ``pdfplumber.open`` and, if that
            fails, to ``PyPDF4.PdfFileReader`` (presumably a Streamlit upload
            or another binary file-like object -- confirm against callers).

    Returns:
        str: the page texts joined with newlines, or ``""`` when both parsers
        fail (the failure is reported through ``st.error``).
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            # ``or ""`` keeps the join safe when a page yields no text.
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Deliberate best-effort: any pdfplumber failure falls through to
        # PyPDF4. ``Exception`` (not a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        try:
            # pdfplumber may already have read part of the stream; rewind so
            # PyPDF4 starts from the beginning of the file.
            if hasattr(uploaded_file, "seek"):
                uploaded_file.seek(0)
            reader = PyPDF4.PdfFileReader(uploaded_file)
            return "\n".join(
                reader.getPage(i).extractText() for i in range(reader.numPages)
            )
        except Exception as e:
            st.error(f"Error reading PDF: {e}")
            return ""
def load_text(file):
    """Return the text content of an uploaded ``.txt``, ``.pdf`` or ``.docx`` file.

    The file type is chosen from the text after the last ``.`` in
    ``file.name``, compared case-insensitively.

    Args:
        file: object exposing ``name`` and, for ``.txt`` input, ``getvalue()``
            returning bytes. A falsy value yields ``""``.

    Returns:
        str: the extracted text, or ``""`` for a missing file, an unsupported
        extension (reported via ``st.warning``) or any failure while reading
        (reported via ``st.error``).
    """
    if not file:
        return ""
    try:
        ext = file.name.split('.')[-1].lower()
        if ext == 'txt':
            # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading
            # byte-order mark, which would otherwise stick to the first word
            # and show up as a spurious difference.
            return file.getvalue().decode("utf-8-sig")
        if ext == 'pdf':
            return extract_text_from_pdf(file)
        if ext == 'docx':
            return docx2txt.process(file)
        st.warning(f"Unsupported file type: {ext}")
        return ""
    except Exception as e:
        st.error(f"Error loading file: {e}")
        return ""
def highlight_diff(text1, text2):
    """Render a word-level diff of two texts as an HTML fragment.

    Both texts are split on whitespace and compared with ``difflib.Differ``.
    Words only in ``text1`` are wrapped in a red-background ``<span>``, words
    only in ``text2`` in a green-background ``<span>``, and common words are
    emitted as plain text. Every emitted word is followed by one space.

    Args:
        text1: the original text.
        text2: the text compared against ``text1``.

    Returns:
        str: HTML fragment; all document words are HTML-escaped.
    """
    pieces = []
    for entry in difflib.Differ().compare(text1.split(), text2.split()):
        tag = entry[:2]
        if tag == "? ":
            # Differ's intraline hint rows ("?  ^") are not words; skip them
            # so they do not leak into the rendered output.
            continue
        # The fragment is rendered as raw HTML, so document text must be
        # escaped to stop markup in an uploaded file from being interpreted.
        word = html.escape(entry[2:])
        if tag == "- ":
            pieces.append(f'<span style="background-color:#ffcccc">{word}</span> ')
        elif tag == "+ ":
            pieces.append(f'<span style="background-color:#ccffcc">{word}</span> ')
        else:
            pieces.append(word + " ")
    return "".join(pieces)
def compute_similarity(text1, text2):
    """Return a similarity percentage for two texts.

    The score is the cosine similarity of the two texts' TF-IDF vectors,
    multiplied by 100. If that computation raises, the character-level
    ``difflib.SequenceMatcher`` ratio (also multiplied by 100) is used instead.

    Args:
        text1: first text.
        text2: second text.

    Returns:
        float: ``0.0`` when either text is empty or whitespace-only,
        otherwise the similarity percentage.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        # The custom token pattern also keeps single-character tokens.
        tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = tfidf.fit_transform([text1, text2])
        sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        # Convert to a built-in float so both code paths return the same type.
        return float(sim[0][0]) * 100
    except Exception:
        # Best-effort fallback; ``Exception`` (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
# Query the Zephyr model hosted on Hugging Face
def query_zephyr_model(text1, text2, question, max_new_tokens=512):
    """Ask the hosted Zephyr model a question about two contract texts.

    Both texts and the question are placed in a single prompt that is sent
    through the module-level ``inference_client``.

    Args:
        text1: text of the first contract.
        text2: text of the second contract.
        question: the user's question about the contracts.
        max_new_tokens: upper bound on the length of the generated answer.

    Returns:
        str | None: the generated answer, or ``None`` when the request fails
        (the error is reported through ``st.error``).
    """
    prompt = f"Compare the following two contracts and answer the question:\nText 1: {text1}\nText 2: {text2}\nQuestion: {question}"
    try:
        # InferenceClient is not callable; text generation goes through
        # ``text_generation``, which returns the generated string directly.
        return inference_client.text_generation(prompt, max_new_tokens=max_new_tokens)
    except Exception as e:
        st.error(f"Error querying the model: {e}")
        return None
# ========== MAIN ==========
def main():
    """Render the Contract Analyzer page.

    The page has four steps: upload two documents, show their extracted
    text, compare them on demand (similarity score plus highlighted diff),
    and answer a free-form question about them through the Zephyr model.
    Rendering stops after the upload step until both documents yield text.
    """
    st.title("📑 Contract Analyzer")
    st.markdown("Upload two contracts, compare them, and ask any question!")

    # Upload documents
    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)
    with col1:
        file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
    with col2:
        file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")

    text1 = load_text(file1) if file1 else ""
    text2 = load_text(file2) if file2 else ""

    if not (text1 and text2):
        st.warning("Please upload both documents to continue.")
        return

    # Display uploaded texts
    st.header("2. Documents Content")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("First Document")
        st.text_area("Content of first document:", text1, height=300)
    with col2:
        st.subheader("Second Document")
        st.text_area("Content of second document:", text2, height=300)

    # Compare documents
    st.header("3. Compare Documents")
    if st.button("Compare Documents"):
        sim_score = compute_similarity(text1, text2)
        st.metric("Similarity Score", f"{sim_score:.2f}%")
        diff_html = highlight_diff(text1, text2)
        st.markdown("**Differences Highlighted:**", unsafe_allow_html=True)
        st.markdown(f"<div style='border:1px solid #ccc; padding:10px; max-height:400px; overflow:auto'>{diff_html}</div>", unsafe_allow_html=True)

    # Ask any question
    st.header("4. Ask a Question")
    user_question = st.text_input("Enter your question about the contracts:")
    if user_question and st.button("Analyze Question"):
        # st.columns returns a list even for a single column, so unpack it
        # before using the column as a context manager.
        (answer_col,) = st.columns(1)
        with answer_col:
            st.subheader("Answer from Document")
            with st.spinner("Analyzing..."):
                try:
                    pred = query_zephyr_model(text1, text2, user_question)
                except Exception as e:
                    st.error(f"Failed on Document: {e}")
                else:
                    # None means the query already reported its own error;
                    # do not render it as a successful answer.
                    if pred is not None:
                        st.success(pred)
# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|