Spaces:

ikraamkb
/

Summarization

Running

File size: 2,466 Bytes

4ab997d

### ✅ app.py — Document QA Backend (Cleaned)
from fastapi import FastAPI
from fastapi.responses import FileResponse, JSONResponse
import fitz  # PyMuPDF
import easyocr
import openpyxl
import pptx
import docx
from transformers import pipeline
from gtts import gTTS
import tempfile
import os

# FastAPI application object. No routes are registered in the portion of the
# file visible here.
app = FastAPI()

# Extractive question-answering pipeline, built once at import time and reused
# by every call to answer_question_from_doc.
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
# EasyOCR reader configured for English and French.
# NOTE(review): `reader` is not referenced by any function visible in this
# file — confirm it is used elsewhere, otherwise it only adds startup cost.
reader = easyocr.Reader(['en', 'fr'])

def extract_text_from_pdf(pdf_file):
    """Return the plain text of every page of a PDF, joined with newlines.

    Args:
        pdf_file: Whatever ``fitz.open`` accepts as a document source
            (the caller in this file passes a filesystem path).

    Returns:
        The concatenated page text, or the string ``"Error reading PDF: ..."``
        if anything goes wrong while opening or reading the document. No
        exception is propagated to the caller.
    """
    try:
        with fitz.open(pdf_file) as document:
            page_texts = []
            for pdf_page in document:
                page_texts.append(pdf_page.get_text("text"))
            return "\n".join(page_texts)
    except Exception as exc:
        # Failures are reported in-band as text rather than raised.
        return f"Error reading PDF: {exc}"

def extract_text_from_docx(docx_file):
    """Return the text of all non-blank paragraphs of a DOCX, joined with newlines.

    Args:
        docx_file: Whatever ``docx.Document`` accepts as a document source
            (the caller in this file passes a filesystem path).

    Returns:
        The concatenated paragraph text, or the string
        ``"Error reading DOCX: ..."`` if the document cannot be opened or
        read. No exception is propagated to the caller.
    """
    # Report failures in-band, exactly like the PDF/PPTX/XLSX extractors do,
    # so that a corrupt upload does not raise out of the calling code.
    try:
        doc = docx.Document(docx_file)
        # Paragraphs that are empty or whitespace-only are skipped.
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        return f"Error reading DOCX: {e}"

def extract_text_from_pptx(pptx_file):
    """Return the text of every text-bearing shape in a PPTX, joined with newlines.

    Slides are visited in order, and within each slide every shape that
    exposes a ``text`` attribute contributes one entry.

    Args:
        pptx_file: Whatever ``pptx.Presentation`` accepts as a source
            (the caller in this file passes a filesystem path).

    Returns:
        The concatenated shape text, or the string
        ``"Error reading PPTX: ..."`` if the presentation cannot be opened or
        read. No exception is propagated to the caller.
    """
    try:
        presentation = pptx.Presentation(pptx_file)
        shape_texts = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Shapes without a `text` attribute are skipped.
                if not hasattr(shape, "text"):
                    continue
                shape_texts.append(shape.text)
        return "\n".join(shape_texts)
    except Exception as exc:
        # Failures are reported in-band as text rather than raised.
        return f"Error reading PPTX: {exc}"

def extract_text_from_xlsx(xlsx_file):
    """Return the cell values of every sheet of an XLSX workbook as text.

    Each worksheet row becomes one line: its non-empty cell values are
    converted with ``str`` and separated by single spaces. Lines from all
    sheets (in workbook order) are joined with newlines. A row with no
    values contributes an empty line.

    Args:
        xlsx_file: Whatever ``openpyxl.load_workbook`` accepts as a source
            (the caller in this file passes a filesystem path).

    Returns:
        The workbook text, or the string ``"Error reading XLSX: ..."`` if the
        workbook cannot be opened or read. No exception is propagated to the
        caller.
    """
    try:
        wb = openpyxl.load_workbook(xlsx_file)
        lines = []
        for sheet_name in wb.sheetnames:
            for row in wb[sheet_name].iter_rows(values_only=True):
                # Skip only genuinely empty cells (None or ""). A plain
                # truthiness test would also discard legitimate values such
                # as 0, 0.0 and False.
                values = [
                    str(value)
                    for value in row
                    if value is not None and value != ""
                ]
                lines.append(" ".join(values))
        return "\n".join(lines)
    except Exception as e:
        # Failures are reported in-band as text rather than raised.
        return f"Error reading XLSX: {e}"

def answer_question_from_doc(file, question):
    """Answer a question about an uploaded document and speak the answer.

    The upload is copied to a private temporary file, its text is extracted
    according to the file extension (pdf, docx, pptx or xlsx), the QA model
    is run over that text, and the answer is synthesised to an MP3 with gTTS.

    Args:
        file: Upload object exposing ``filename`` (str or None) and a
            ``read()`` method whose result is written to a binary file.
        question: The question to ask of the document text.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` is the answer and
        ``audio_path`` is the path of a newly created ``.mp3`` file, which
        the caller is responsible for deleting. On any failure ``text`` is a
        human-readable message and ``audio_path`` is ``None``.
    """
    # Only the extension of the client-supplied name is trusted. A name
    # without a dot (or a missing name) has no extension at all.
    _, dot, suffix = (file.filename or "").rpartition(".")
    ext = suffix.lower() if dot else ""
    # Validate before touching the disk so unsupported uploads are never
    # read or written.
    if ext not in ("pdf", "docx", "pptx", "xlsx"):
        return "Unsupported file format.", None

    # The client-supplied name is never used as a path: building
    # "/tmp/<filename>" would let a name such as "../../x" escape /tmp and
    # would let concurrent uploads with the same name clobber each other.
    upload = tempfile.NamedTemporaryFile(delete=False, suffix=f".{ext}")
    file_path = upload.name
    try:
        with upload:
            # NOTE(review): if `file` is a FastAPI/Starlette UploadFile,
            # `read()` is a coroutine and must be awaited by the caller —
            # confirm what type is actually passed in.
            upload.write(file.read())

        if ext == "pdf":
            context = extract_text_from_pdf(file_path)
        elif ext == "docx":
            context = extract_text_from_docx(file_path)
        elif ext == "pptx":
            context = extract_text_from_pptx(file_path)
        else:
            context = extract_text_from_xlsx(file_path)
    finally:
        # The on-disk copy is only needed for extraction. Removal is
        # best-effort so a cleanup failure never masks the real outcome.
        try:
            os.remove(file_path)
        except OSError:
            pass

    # The extractors report failures in-band as "Error reading <EXT>: ...".
    # Surface that message instead of running the QA model over it.
    if context.startswith(f"Error reading {ext.upper()}: "):
        return context, None

    if not context.strip():
        return "No text found in the document.", None

    try:
        result = qa_model({"question": question, "context": context})
        answer = result["answer"]
        tts = gTTS(answer)
        # Reserve a unique output path, then close the handle before gTTS
        # writes to it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        try:
            tts.save(audio_path)
        except Exception:
            # Do not leave an empty placeholder behind when synthesis fails.
            os.remove(audio_path)
            raise
        return answer, audio_path
    except Exception as e:
        return f"Error generating answer: {e}", None