Spaces:
Running
Running
File size: 5,398 Bytes
d9acf37 8d69bdb ce9dc81 d9acf37 b2fb02e d9acf37 b2fb02e 8d69bdb b2fb02e ce9dc81 14db4b1 b94fbeb 14db4b1 b2fb02e ce9dc81 b2fb02e d9acf37 96d8c6a d9acf37 96d8c6a d9acf37 b2fb02e 4cb6887 b2fb02e 4cb6887 b2fb02e d9acf37 96d8c6a ce9dc81 96d8c6a d9acf37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import difflib
import html
import os
from io import StringIO

import docx2txt
import pdfplumber
import PyPDF4
import streamlit as st
from huggingface_hub import InferenceClient  # Hugging Face Inference API client
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# ========== CONFIG ==========
st.set_page_config(page_title="📑 Contract Analyzer", layout="wide")
# ========== FUNCTIONS ==========
# Hugging Face API token read from the environment; None when unset.
token = os.environ.get("HF_TOKEN")
# Load the Hugging Face model from the Hub
def load_inference_client():
    """Construct an InferenceClient for the Zephyr-7B-beta model.

    Returns the client on success; on failure, surfaces the error in
    the Streamlit UI and returns None.
    """
    try:
        client = InferenceClient(
            model="HuggingFaceH4/zephyr-7b-beta",
            token=token,
        )
    except Exception as e:
        st.error(f"Error loading InferenceClient: {e}")
        return None
    return client


# Built once at import time; downstream code must tolerate None.
inference_client = load_inference_client()
def extract_text_from_pdf(uploaded_file):
    """Extract all page text from a PDF file-like object with pdfplumber.

    Returns the pages joined by newlines, or "" (after showing a
    Streamlit error) when the PDF is unreadable or has no extractable text.
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            # extract_text() may return None for image-only pages.
            pages = [page.extract_text() or "" for page in pdf.pages]
        text = "\n".join(pages)
        if not text.strip():
            raise ValueError("No extractable text found in the PDF")
        return text
    except Exception as e:
        st.error(f"Error reading PDF: {e}")
        return ""
def load_text(file):
    """Return the text content of an uploaded file (.txt, .pdf, or .docx).

    Enforces a 10 MB size limit. Returns "" (after showing a Streamlit
    warning/error) for missing, oversized, unsupported, or unreadable files.
    """
    if not file:
        return ""
    try:
        # Reject oversized uploads before attempting any parsing.
        if file.size > 10 * 1024 * 1024:  # 10MB
            st.warning("File is too large. Please upload a smaller file.")
            return ""
        ext = file.name.split('.')[-1].lower()
        if ext == 'txt':
            # getvalue() returns bytes; decode directly — the previous
            # StringIO(...).read() round-trip added nothing.
            return file.getvalue().decode("utf-8")
        elif ext == 'pdf':
            return extract_text_from_pdf(file)
        elif ext == 'docx':
            return docx2txt.process(file)
        else:
            st.warning(f"Unsupported file type: {ext}")
            return ""
    except Exception as e:
        # Covers UnicodeDecodeError and parser failures alike.
        st.error(f"Error loading file: {e}")
        return ""
def highlight_diff(text1, text2):
    """Render a word-level diff of two texts as an HTML fragment.

    Removed words are wrapped in a red-highlighted span, added words in a
    green one, unchanged words emitted as plain text. Every word is
    HTML-escaped because the caller renders this with unsafe_allow_html,
    so markup characters in the contracts must not break (or inject into)
    the page.
    """
    differ = difflib.Differ()
    diff = differ.compare(text1.split(), text2.split())
    parts = []
    for word in diff:
        if word.startswith("? "):
            # Differ emits '? ' guide lines for similar adjacent entries;
            # they are annotations, not document words — skip them
            # (previously they leaked into the output as stray text).
            continue
        token = html.escape(word[2:])
        if word.startswith("- "):
            parts.append(f'<span style="background-color:#ffcccc">{token}</span> ')
        elif word.startswith("+ "):
            parts.append(f'<span style="background-color:#ccffcc">{token}</span> ')
        else:
            parts.append(token + " ")
    # join() instead of repeated += avoids quadratic string building.
    return "".join(parts)
def compute_similarity(text1, text2):
    """Return the similarity of two texts as a percentage in [0, 100].

    Primary metric is TF-IDF cosine similarity; if vectorization fails
    (e.g. no usable tokens), falls back to difflib's sequence ratio.
    Blank input on either side scores 0.0.
    """
    if not text1.strip() or not text2.strip():
        return 0.0
    try:
        tfidf = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')
        tfidf_matrix = tfidf.fit_transform([text1, text2])
        sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
        return sim[0][0] * 100
    except Exception:
        # A bare `except:` here also swallowed KeyboardInterrupt and
        # SystemExit; Exception preserves the intended fallback only.
        return difflib.SequenceMatcher(None, text1, text2).ratio() * 100
# Query the Zephyr model on Hugging Face
def query_zephyr_model(text1, text2, question):
    """Ask the Zephyr model to compare two contracts and answer a question.

    Returns the generated answer text, or None when the client is
    unavailable or the request fails (the error is shown in the UI).
    """
    if inference_client is None:
        # load_inference_client() may have failed at startup.
        st.error("Inference client is not available.")
        return None
    prompt = f"Compare the following two contracts and answer the question:\nText 1: {text1}\nText 2: {text2}\nQuestion: {question}"
    try:
        result = inference_client.text_generation(prompt)
        # With the default details=False, text_generation returns a plain
        # string; `.generated_text` exists only on detailed responses, so
        # the previous unconditional attribute access raised AttributeError.
        return result if isinstance(result, str) else result.generated_text
    except Exception as e:
        st.error(f"Error querying the model: {e}")
        return None
# ========== MAIN ==========
def main():
    """Streamlit entry point: upload, display, compare, and query contracts."""
    st.title("📑 Contract Analyzer")
    st.markdown("Upload two contracts, compare them, and ask any question!")

    # --- 1. Upload both documents ---
    st.header("1. Upload Documents")
    col1, col2 = st.columns(2)
    with col1:
        file1 = st.file_uploader("Upload First Document", type=["txt", "pdf", "docx"], key="file1")
    with col2:
        file2 = st.file_uploader("Upload Second Document", type=["txt", "pdf", "docx"], key="file2")

    text1 = load_text(file1) if file1 else ""
    text2 = load_text(file2) if file2 else ""
    if not (text1 and text2):
        st.warning("Please upload both documents to continue.")
        return

    # --- 2. Show the extracted contents side by side ---
    st.header("2. Documents Content")
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("First Document")
        st.text_area("Content of first document:", text1, height=300)
    with col2:
        st.subheader("Second Document")
        st.text_area("Content of second document:", text2, height=300)

    # --- 3. Similarity score and word-level diff ---
    st.header("3. Compare Documents")
    if st.button("Compare Documents"):
        sim_score = compute_similarity(text1, text2)
        st.metric("Similarity Score", f"{sim_score:.2f}%")
        diff_html = highlight_diff(text1, text2)
        st.markdown("**Differences Highlighted:**", unsafe_allow_html=True)
        st.markdown(f"<div style='border:1px solid #ccc; padding:10px; max-height:400px; overflow:auto'>{diff_html}</div>", unsafe_allow_html=True)

    # --- 4. Free-form Q&A over both contracts ---
    st.header("4. Ask a Question")
    user_question = st.text_input("Enter your question about the contracts:")
    if user_question and st.button("Analyze Question"):
        with st.spinner("Analyzing..."):
            try:
                pred = query_zephyr_model(text1, text2, user_question)
                if pred:
                    st.success(pred)
                else:
                    st.error("Failed to get a valid answer from the model.")
            except Exception as e:
                st.error(f"Failed on Document: {e}")


if __name__ == "__main__":
    main()
|