File size: 3,039 Bytes
4cc0ea8
 
bcf9fd7
 
 
 
3a765ef
bcf9fd7
b559d3b
 
4cc0ea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa8d0c9
4cc0ea8
fa8d0c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cc0ea8
 
 
 
 
 
 
 
fa8d0c9
4cc0ea8
 
 
fa8d0c9
 
4cc0ea8
fa8d0c9
4cc0ea8
fa8d0c9
4cc0ea8
 
 
fa8d0c9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import docx
import PyPDF2
import os
import re
import json
import time
import tempfile
from typing import Dict, Any, List, Optional
from src.quiz_processing import analyze_document


def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    The text of each page is followed by a single newline, so the result
    ends with ``"\\n"`` whenever the document has at least one page.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text as one string (empty if the PDF has no pages).

    Raises:
        Exception: If the file cannot be opened or parsed. The message is
            prefixed with "Error extracting text from PDF:" and the
            underlying error is chained as ``__cause__``.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Defensive: a falsy extraction result (e.g. None) is treated as
            # empty text so one unreadable page does not abort the whole file.
            page_texts = [
                (page.extract_text() or "") + "\n" for page in reader.pages
            ]
        return "".join(page_texts)
    except Exception as e:
        raise Exception(f"Error extracting text from PDF: {str(e)}") from e

def extract_text_from_docx(docx_path):
    """Return the text of all body paragraphs of the DOCX at *docx_path*.

    Paragraph texts are joined with newlines. Only ``doc.paragraphs`` is
    read; nothing else in the document is visited by this function.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        The paragraph texts joined by ``"\\n"``.

    Raises:
        Exception: If the document cannot be opened or read. The message is
            prefixed with "Error extracting text from DOCX:" and the
            underlying error is chained as ``__cause__``.
    """
    try:
        doc = docx.Document(docx_path)
        return "\n".join([paragraph.text for paragraph in doc.paragraphs])
    except Exception as e:
        raise Exception(f"Error extracting text from DOCX: {str(e)}") from e

def extract_text_from_txt(txt_path):
    """Return the full contents of the UTF-8 text file at *txt_path*.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The decoded file contents, without a leading byte-order mark.

    Raises:
        Exception: If the file cannot be opened or decoded. The message is
            prefixed with "Error extracting text from TXT:" and the
            underlying error is chained as ``__cause__``.
    """
    try:
        # 'utf-8-sig' decodes plain UTF-8 identically but also strips a
        # leading BOM, which would otherwise surface in the text as '\ufeff'.
        with open(txt_path, 'r', encoding='utf-8-sig') as file:
            return file.read()
    except Exception as e:
        raise Exception(f"Error extracting text from TXT: {str(e)}") from e

def _remove_quietly(path):
    """Delete *path* if one is given, ignoring filesystem errors."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: a failed delete must never mask the caller's
        # real result or error.
        pass


def process_document(document_path, gemini_api_key, language, content_type):
    """Extract the text of a document and pass it to ``analyze_document``.

    Args:
        document_path: Either a filesystem path (``str`` or ``os.PathLike``)
            or an uploaded-file object. An upload must expose ``read()`` or
            ``file.chunks()``; its ``name`` attribute supplies the extension.
        gemini_api_key: Forwarded unchanged to ``analyze_document``.
        language: Forwarded unchanged to ``analyze_document``.
        content_type: Forwarded unchanged to ``analyze_document``.

    Returns:
        A 5-tuple ``(status, text_file_path, formatted_output, txt_path,
        json_path)``. ``text_file_path`` is a temporary ``.txt`` file holding
        the extracted text; the last three values come from
        ``analyze_document``. On any failure the tuple is
        ``(error_message, None, error_message, None, None)``; this function
        reports errors through the return value and does not raise.
    """
    supported_extensions = ('.pdf', '.docx', '.txt')
    upload_copy = None      # temp copy of an uploaded file; always removed
    text_file_path = None   # handed to the caller on success only
    try:
        # Classify the input BEFORE touching `.name`: a plain path string has
        # no such attribute.
        if isinstance(document_path, (str, os.PathLike)):
            source_path = os.fspath(document_path)
            source_name = source_path
        elif hasattr(document_path, 'read') or hasattr(document_path, 'file'):
            source_path = None
            source_name = str(getattr(document_path, 'name', None) or '')
        else:
            raise Exception("Unsupported document_path type")

        # Reject unknown types up front so no temp file is created for them.
        file_extension = os.path.splitext(source_name)[-1].lower()
        if file_extension not in supported_extensions:
            raise Exception(f"Unsupported file type: {file_extension}")

        if source_path is None:
            # mkstemp creates the file atomically, unlike the deprecated and
            # race-prone mktemp.
            fd, upload_copy = tempfile.mkstemp(suffix=file_extension)
            with os.fdopen(fd, 'wb') as f:
                if hasattr(document_path, 'read'):
                    f.write(document_path.read())
                else:
                    # Upload objects exposing `.file.chunks()` (e.g. some web
                    # framework uploads) are copied chunk by chunk.
                    for chunk in document_path.file.chunks():
                        f.write(chunk)
            source_path = upload_copy

        if file_extension == '.pdf':
            text = extract_text_from_pdf(source_path)
        elif file_extension == '.docx':
            text = extract_text_from_docx(source_path)
        else:
            text = extract_text_from_txt(source_path)

        fd, text_file_path = tempfile.mkstemp(suffix='.txt')
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(text)

        # analyze_document is imported from src.quiz_processing.
        formatted_output, json_path, txt_path = analyze_document(
            text, gemini_api_key, language, content_type
        )

        return (
            "Document processed successfully",
            text_file_path,
            formatted_output,
            txt_path,
            json_path,
        )
    except Exception as e:
        # The caller receives None for the text file on failure, so do not
        # leave an orphaned temp file behind.
        _remove_quietly(text_file_path)
        error_message = f"Error processing document: {str(e)}"
        return error_message, None, error_message, None, None
    finally:
        # The copy of the upload is only needed during extraction.
        _remove_quietly(upload_copy)