Spaces:

MrSimple01
/

SimpleLearn_2

Running

File size: 3,039 Bytes

4cc0ea8
 
bcf9fd7
 
 
 
3a765ef
bcf9fd7
b559d3b
 
4cc0ea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa8d0c9
4cc0ea8
fa8d0c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cc0ea8
 
 
 
 
 
 
 
fa8d0c9
4cc0ea8
 
 
fa8d0c9
 
4cc0ea8
fa8d0c9
4cc0ea8
fa8d0c9
4cc0ea8
 
 
fa8d0c9

import docx
import PyPDF2
import os
import re
import json
import time
import tempfile
from typing import Dict, Any, List, Optional
from src.quiz_processing import analyze_document


def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, in page order.

    Each page's extracted text is followed by a single newline.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The concatenated page texts.

    Raises:
        Exception: If the file cannot be opened or parsed. The underlying
            error is attached as ``__cause__``.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Extraction must happen while the file is still open; build the
            # result with one join instead of growing a string per page.
            return "".join(page.extract_text() + "\n" for page in reader.pages)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {str(e)}") from e

def extract_text_from_docx(docx_path: str) -> str:
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Only ``doc.paragraphs`` is read; the paragraph texts are joined with
    newlines.

    Args:
        docx_path: Filesystem path of the DOCX file to read.

    Returns:
        The newline-joined paragraph texts.

    Raises:
        Exception: If the document cannot be opened or read. The underlying
            error is attached as ``__cause__``.
    """
    try:
        doc = docx.Document(docx_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as e:
        raise Exception(f"Error extracting text from DOCX: {str(e)}") from e

def extract_text_from_txt(txt_path: str) -> str:
    """Return the full contents of a UTF-8 encoded text file.

    Args:
        txt_path: Filesystem path of the text file to read.

    Returns:
        The decoded file contents.

    Raises:
        Exception: If the file cannot be opened or decoded as UTF-8. The
            underlying error is attached as ``__cause__``.
    """
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        raise Exception(f"Error extracting text from TXT: {str(e)}") from e

def _remove_quietly(path):
    """Delete *path* if one was created, ignoring filesystem errors."""
    if path is None:
        return
    try:
        os.remove(path)
    except OSError:
        pass


def process_document(document_path, gemini_api_key, language, content_type):
    """Extract a document's text and pass it to ``analyze_document``.

    Args:
        document_path: Either a filesystem path string, an object with a
            ``read()`` method, or an object whose ``file`` attribute
            provides ``chunks()``. For non-string inputs the file type is
            taken from the object's ``name`` attribute.
        gemini_api_key: Forwarded unchanged to ``analyze_document``.
        language: Forwarded unchanged to ``analyze_document``.
        content_type: Forwarded unchanged to ``analyze_document``.

    Returns:
        A 5-tuple ``(status, text_file_path, formatted_output, txt_path,
        json_path)``. On success ``status`` is a fixed success message and
        ``text_file_path`` points at a temporary ``.txt`` file holding the
        extracted text; the last three items come from ``analyze_document``.
        On any failure the tuple is
        ``(error_message, None, error_message, None, None)``; this function
        does not raise.
    """
    upload_copy = None     # temp copy of an uploaded object; always removed
    text_file_path = None  # extracted-text file; kept only on success
    try:
        # Work out the file name first, without assuming the input has a
        # ``name`` attribute (plain path strings do not).
        if isinstance(document_path, str):
            source_name = getattr(document_path, 'name', document_path)
        elif hasattr(document_path, 'read') or hasattr(document_path, 'file'):
            source_name = getattr(document_path, 'name', '') or ''
        else:
            raise Exception("Unsupported document_path type")

        # Reject unknown types before copying anything to disk.
        file_extension = os.path.splitext(str(source_name))[-1].lower()
        if file_extension not in ('.pdf', '.docx', '.txt'):
            raise Exception(f"Unsupported file type: {file_extension}")

        if isinstance(document_path, str):
            source_path = document_path
        else:
            # mkstemp creates the file atomically, unlike the deprecated
            # mktemp, which only returns a name.
            fd, upload_copy = tempfile.mkstemp(suffix=file_extension)
            with os.fdopen(fd, 'wb') as f:
                if hasattr(document_path, 'read'):
                    f.write(document_path.read())
                else:
                    for chunk in document_path.file.chunks():
                        f.write(chunk)
            source_path = upload_copy

        if file_extension == '.pdf':
            text = extract_text_from_pdf(source_path)
        elif file_extension == '.docx':
            text = extract_text_from_docx(source_path)
        else:
            text = extract_text_from_txt(source_path)

        fd, text_file_path = tempfile.mkstemp(suffix='.txt')
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(text)

        # analyze_document is imported from src.quiz_processing.
        formatted_output, json_path, txt_path = analyze_document(
            text, gemini_api_key, language, content_type
        )

        return "Document processed successfully", text_file_path, formatted_output, txt_path, json_path
    except Exception as e:
        # The caller receives None for the text file on failure, so it
        # could never clean the file up itself.
        _remove_quietly(text_file_path)
        error_message = f"Error processing document: {str(e)}"
        return error_message, None, error_message, None, None
    finally:
        _remove_quietly(upload_copy)