File size: 14,477 Bytes
9d13a5a
 
 
 
 
 
f67e43c
45e4b1d
9d13a5a
d2e3ccd
827b95e
d2e3ccd
9d13a5a
 
a3b8412
9d13a5a
827b95e
f67e43c
 
 
 
 
2dddabe
f67e43c
827b95e
 
 
9d13a5a
 
 
 
 
45e4b1d
 
 
 
 
 
 
 
 
 
f67e43c
45e4b1d
f67e43c
9d13a5a
 
 
f67e43c
 
2dddabe
 
f67e43c
 
2dddabe
9d13a5a
7905cfc
9d13a5a
 
2dddabe
7905cfc
 
9d13a5a
 
7905cfc
9d13a5a
2dddabe
9d13a5a
 
 
 
2dddabe
9d13a5a
 
 
f67e43c
9d13a5a
 
 
f67e43c
9d13a5a
 
 
827b95e
9d13a5a
 
 
827b95e
 
 
 
 
 
 
 
 
 
 
 
 
9d13a5a
 
d2e3ccd
 
 
 
 
 
827b95e
 
 
d2e3ccd
 
 
 
 
 
 
 
 
 
827b95e
d2e3ccd
827b95e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d13a5a
 
 
 
 
7905cfc
 
 
 
 
f67e43c
 
9d13a5a
 
 
f67e43c
 
827b95e
 
 
9d13a5a
 
f67e43c
9d13a5a
 
 
 
 
d2e3ccd
f67e43c
 
9d13a5a
d2e3ccd
9d13a5a
f67e43c
9d13a5a
 
 
 
f67e43c
9d13a5a
 
 
d2e3ccd
f67e43c
 
 
 
9d13a5a
d2e3ccd
f67e43c
d2e3ccd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d13a5a
f67e43c
 
 
 
 
 
45e4b1d
 
 
 
 
f67e43c
45e4b1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f67e43c
 
 
 
d2e3ccd
f67e43c
 
 
 
d2e3ccd
f67e43c
d2e3ccd
f67e43c
9d13a5a
f67e43c
 
 
 
 
 
 
9d13a5a
 
 
 
f67e43c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d13a5a
f67e43c
9d13a5a
f67e43c
 
 
 
9d13a5a
f67e43c
 
9d13a5a
f67e43c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d13a5a
f67e43c
 
 
 
 
 
 
 
827b95e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
import io
import logging
import os
import shutil
import subprocess
import tempfile
import time
from threading import Thread

import jinja2
import pdfkit
import torch
from flask import Flask, request, send_file, jsonify
from flask_cors import CORS
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Configure cache directories: point both the Hugging Face cache (HF_HOME)
# and the generic XDG cache at /app/.cache.
# NOTE(review): these assignments run after `transformers` has already been
# imported at the top of the file; any library that reads these variables at
# import time would not see them — confirm, and move them above the imports
# if so.
os.environ['HF_HOME'] = '/app/.cache'
os.environ['XDG_CACHE_HOME'] = '/app/.cache'

# Configure logging: root logger at INFO with "timestamp [LEVEL] message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s'
)

# Initialize Flask app and wrap it with flask_cors.CORS (no options passed,
# so the extension's defaults apply).
app = Flask(__name__)
CORS(app)

# Global state tracking, written by load_model() and read by the endpoints:
#   model_loaded - set to True once the pipeline has been built
#   load_error   - str() of the exception if loading failed, otherwise None
#   generator    - the text-generation pipeline, or None until it is loaded
model_loaded = False
load_error = None
generator = None

# Find wkhtmltopdf path: prefer the conventional install location, otherwise
# search PATH. shutil.which() returns None when nothing is found, so no
# exception handling (and no external `which` binary) is needed.
WKHTMLTOPDF_PATH = '/usr/bin/wkhtmltopdf'
if not os.path.exists(WKHTMLTOPDF_PATH):
    _located = shutil.which('wkhtmltopdf')
    if _located:
        WKHTMLTOPDF_PATH = _located
    else:
        app.logger.warning("Could not find wkhtmltopdf path. Using default.")
        # Bare command name: left for the OS to resolve when it is executed.
        WKHTMLTOPDF_PATH = 'wkhtmltopdf'

# Configure wkhtmltopdf for pdfkit.
# NOTE(review): pdfkit.configuration() appears to validate the path and raise
# OSError when no executable is there — confirm against the installed pdfkit.
# Catching it keeps the module importable so the service can still start and
# report its state; None is passed through to pdfkit as "no explicit
# configuration" by the PDF endpoint.
try:
    pdf_config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
except OSError as e:
    app.logger.warning(f"pdfkit configuration failed: {e}")
    pdf_config = None

def load_model():
    """Load GPT-2 and build the text-generation pipeline.

    Results are published through module globals rather than returned:
    on success ``generator`` holds the pipeline and ``model_loaded`` becomes
    True; on any exception ``load_error`` receives the error text and the
    failure is logged with its traceback. Nothing is raised to the caller.
    """
    global model_loaded, load_error, generator
    try:
        app.logger.info("Starting model loading process")

        # Half precision on GPU, full precision on CPU.
        if torch.cuda.is_available():
            dtype, device = torch.float16, "cuda"
        else:
            dtype, device = torch.float32, "cpu"
        app.logger.info(f"Device set to use {device}")

        model_options = {
            "use_safetensors": True,
            "device_map": "auto",
            "torch_dtype": dtype,
            "low_cpu_mem_usage": True,
            "offload_folder": "offload",
        }
        model = AutoModelForCausalLM.from_pretrained("gpt2", **model_options)
        tokenizer = AutoTokenizer.from_pretrained("gpt2")

        # No explicit device is given to the pipeline; placement was already
        # delegated to device_map="auto" when the model was loaded.
        generator = pipeline(
            'text-generation',
            model=model,
            tokenizer=tokenizer,
            torch_dtype=dtype,
        )

        model_loaded = True
        app.logger.info(f"Model loaded successfully on {model.device}")

    except Exception as exc:
        load_error = str(exc)
        app.logger.error(f"Model loading failed: {load_error}", exc_info=True)

# Start model loading in a background thread so the rest of this module
# (including the route definitions below) is set up without waiting for
# load_model() to finish; the endpoints read `model_loaded` to find out
# whether loading has completed.
Thread(target=load_model).start()

# --------------------------------------------------
# IEEE Format Template
# --------------------------------------------------
# Jinja2 source for the HTML page that is converted to PDF.
# Variables the template reads:
#   title      - used for both <title> and the <h1> heading
#   authors    - iterable; each item's name is always printed, institution
#                and email only when truthy
#   abstract   - text placed under the "Abstract" heading
#   keywords   - text printed after "Keywords— "
#   sections   - iterable of items with `title` and `content`, rendered
#                inside the two-column div
#   references - iterable; entries are numbered with loop.index (from 1)
IEEE_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>{{ title }}</title>
    <style>
        @page { margin: 0.75in; }
        body { 
            font-family: 'Times New Roman', Times, serif;
            font-size: 12pt;
            line-height: 1.5;
        }
        .header { text-align: center; margin-bottom: 24pt; }
        .two-column { column-count: 2; column-gap: 0.5in; }
        h1 { font-size: 14pt; margin: 12pt 0; }
        h2 { font-size: 12pt; margin: 12pt 0 6pt 0; }
        .abstract { margin-bottom: 24pt; }
        .keywords { font-weight: bold; margin: 12pt 0; }
        .references { margin-top: 24pt; }
        .reference-item { text-indent: -0.5in; padding-left: 0.5in; }
    </style>
</head>
<body>
    <div class="header">
        <h1>{{ title }}</h1>
        <div class="author-info">
            {% for author in authors %}
            {{ author.name }}<br>
            {% if author.institution %}{{ author.institution }}<br>{% endif %}
            {% if author.email %}Email: {{ author.email }}{% endif %}
            {% if not loop.last %}<br>{% endif %}
            {% endfor %}
        </div>
    </div>
    
    <div class="abstract">
        <h2>Abstract</h2>
        {{ abstract }}
        <div class="keywords">Keywords— {{ keywords }}</div>
    </div>
    <div class="two-column">
        {% for section in sections %}
        <h2>{{ section.title }}</h2>
        {{ section.content }}
        {% endfor %}
    </div>
    <div class="references">
        <h2>References</h2>
        {% for ref in references %}
        <div class="reference-item">[{{ loop.index }}] {{ ref }}</div>
        {% endfor %}
    </div>
</body>
</html>
"""

# --------------------------------------------------
# API Endpoints
# --------------------------------------------------
@app.route('/health', methods=['GET'])
def health_check():
    """Report service liveness and model state.

    Always answers 200 with a JSON object containing:
      status       - the constant "ok" (the process is up)
      model_loaded - whether load_model() has finished successfully
      device       - "cuda" when torch reports a CUDA device, else "cpu"

    The previous body had a second, unreachable return after this one that
    referenced undefined names (device_info, status_code); it has been
    removed so the function contains only the code that actually runs.
    """
    return jsonify({
        "status": "ok",
        "model_loaded": model_loaded,
        "device": "cuda" if torch.cuda.is_available() else "cpu"
    }), 200

@app.route('/generate', methods=['POST'])
def generate_pdf():
    """Render the submitted paper as an IEEE-style PDF and return it.

    Expects a JSON object with ``title``, ``authors`` and ``content``.
    ``content`` is run through format_content(); the result is rendered with
    IEEE_TEMPLATE and converted by wkhtmltopdf under xvfb-run, falling back
    to pdfkit when that command fails or cannot be started.

    Returns:
        The PDF as an attachment named after the title, or a JSON error with
        503 (model not loaded yet), 400 (body missing, not a JSON object, or
        lacking a required field) or 500 (any other failure).
    """
    # Check model status
    if not model_loaded:
        app.logger.error("PDF generation requested but model not loaded")
        return jsonify({
            "error": "Model not loaded yet",
            "status": "loading"
        }), 503

    # Declared up front so the cleanup in `finally` can always see them.
    pdf_path = None
    html_path = None

    try:
        app.logger.info("Processing PDF generation request")

        # silent=True makes a missing/invalid JSON body come back as None so
        # it is reported as a 400 below instead of surfacing as a 500.
        data = request.get_json(silent=True)
        if not data:
            app.logger.error("No data provided in request")
            return jsonify({"error": "No data provided"}), 400
        if not isinstance(data, dict):
            app.logger.error("Request body is not a JSON object")
            return jsonify({"error": "Request body must be a JSON object"}), 400

        required = ['title', 'authors', 'content']
        if missing := [field for field in required if field not in data]:
            app.logger.error(f"Missing required fields: {missing}")
            return jsonify({
                "error": f"Missing fields: {', '.join(missing)}"
            }), 400

        # The title is used with str methods for the download name below.
        title = str(data['title'])
        app.logger.info(f"Received request with title: {data['title']}")

        # Format content with model
        app.logger.info("Formatting content using the model")
        formatted = format_content(data['content'])

        app.logger.info("Creating HTML from template")
        # autoescape=True: every value comes from the request or from model
        # output, and the page is rendered by wkhtmltopdf, so it must not be
        # able to inject markup or scripts.
        html = jinja2.Template(IEEE_TEMPLATE, autoescape=True).render(
            title=title,
            authors=data['authors'],
            abstract=formatted.get('abstract', ''),
            keywords=', '.join(formatted.get('keywords', [])),
            sections=formatted.get('sections', []),
            references=formatted.get('references', [])
        )

        # PDF options (an empty value marks a flag without an argument)
        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': 'UTF-8',
            'quiet': ''
        }

        # Command-line form of `options`: "--name value" as separate
        # arguments, and just "--name" for value-less flags such as quiet.
        cli_options = []
        for key, value in options.items():
            cli_options.append(f'--{key}')
            if value:
                cli_options.append(str(value))

        # Create temporary PDF and the HTML file wkhtmltopdf reads from
        app.logger.info("Generating PDF file")
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as f:
            pdf_path = f.name
        html_path = pdf_path + '.html'
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html)

        command = ['xvfb-run', '-a', WKHTMLTOPDF_PATH, *cli_options,
                   html_path, pdf_path]
        app.logger.info(f"Running command: {' '.join(command)}")
        try:
            result = subprocess.run(command, capture_output=True, text=True)
            command_failed = result.returncode != 0
            failure_detail = result.stderr
        except OSError as e:
            # xvfb-run itself could not be started (e.g. not installed).
            command_failed = True
            failure_detail = str(e)

        if command_failed:
            app.logger.error(f"PDF generation command failed: {failure_detail}")
            # Fallback to direct pdfkit if available
            app.logger.info("Trying fallback PDF generation with pdfkit")
            pdfkit.from_string(html, pdf_path, options=options, configuration=pdf_config)

        # Read the PDF into memory so the response does not depend on the
        # temporary file, which is deleted in `finally` below.
        with open(pdf_path, 'rb') as f:
            pdf_bytes = f.read()

        app.logger.info(f"PDF generated successfully at {pdf_path}")
        return send_file(io.BytesIO(pdf_bytes), mimetype='application/pdf',
                         as_attachment=True,
                         download_name=f"{title.replace(' ', '_')}.pdf")

    except Exception as e:
        app.logger.error(f"Request processing failed: {str(e)}", exc_info=True)
        return jsonify({"error": str(e)}), 500

    finally:
        # Clean up both temporary files on every exit path.
        for temp_path in (html_path, pdf_path):
            if not temp_path:
                continue
            try:
                app.logger.info(f"Cleaning up temporary file {temp_path}")
                os.remove(temp_path)
            except OSError as e:
                app.logger.warning(f"Failed to remove temporary file: {str(e)}")

# --------------------------------------------------
# Content Formatting
# --------------------------------------------------
def parse_formatted_content(text):
    """Parse generated text into abstract, keywords, sections and references.

    The parsing is line-based and heuristic:
      * abstract   - non-empty lines after the first line that is exactly
                     "abstract" (any case), up to the first line starting
                     with "keyword" or the end of the text, joined by spaces
      * keywords   - comma-separated items after the first "—" or ":" on the
                     first line starting with "keyword"; placeholder keywords
                     are kept when no such line/separator exists
      * sections   - a line of at most six words whose first character is an
                     uppercase letter or a digit starts a section; following
                     lines are its content, until a "references" line
      * references - every non-empty line after the first "references" line

    NOTE(review): the heading heuristic also matches short lines such as
    "Abstract" or the keywords line, so those can show up as sections too.

    Args:
        text: The generated text to parse.

    Returns:
        dict with keys 'abstract' (str), 'keywords' (list of str),
        'sections' (list of {'title', 'content'} dicts) and 'references'
        (list of str). If parsing raises, a fallback dict holding the whole
        text as a single "Content" section is returned instead.
    """
    app.logger.info("Parsing formatted content")

    try:
        lines = text.split('\n')
        stripped = [line.strip() for line in lines]
        lowered = [entry.lower() for entry in stripped]

        # Default structure
        result = {
            'abstract': '',
            'keywords': ['IEEE', 'format', 'research', 'paper'],
            'sections': [],
            'references': []
        }

        # Extract abstract
        if 'abstract' in lowered:
            start = lowered.index('abstract') + 1
            abstract_text = []
            for entry, low in zip(stripped[start:], lowered[start:]):
                if low.startswith('keyword'):
                    break
                if entry:
                    abstract_text.append(entry)
            result['abstract'] = ' '.join(abstract_text)

        # Extract keywords from the first keyword line, accepting either an
        # em dash ("Keywords— a, b") or a colon ("Keywords: a, b").
        keyword_line = next(
            (entry for entry, low in zip(stripped, lowered)
             if low.startswith('keyword')),
            None,
        )
        if keyword_line is not None:
            separator_positions = [
                pos
                for pos in (keyword_line.find('—'), keyword_line.find(':'))
                if pos != -1
            ]
            if separator_positions:
                tail = keyword_line[min(separator_positions) + 1:]
                result['keywords'] = [
                    k.strip() for k in tail.split(',') if k.strip()
                ]

        # Extract sections
        current_section = None
        section_content = []
        for line, entry, low in zip(lines, stripped, lowered):
            # Checked before the heading test: "References" is itself a short
            # capitalised line and would otherwise be taken for a heading.
            if low == 'references':
                break

            looks_like_heading = (
                bool(entry)
                and (entry[0].isupper() or entry[0].isdigit())
                and len(entry.split()) <= 6
            )
            if looks_like_heading:
                if current_section:
                    # Save the previous section
                    result['sections'].append({
                        'title': current_section,
                        'content': '\n'.join(section_content)
                    })
                    section_content = []
                current_section = entry
            elif current_section:
                # Content keeps the original (unstripped) line.
                section_content.append(line)

        # Save the section still open when the text (or the body before the
        # references) ends; without this the last section would be lost.
        if current_section:
            result['sections'].append({
                'title': current_section,
                'content': '\n'.join(section_content)
            })

        # Extract references
        if 'references' in lowered:
            first = lowered.index('references') + 1
            result['references'] = [
                entry
                for entry, low in zip(stripped[first:], lowered[first:])
                if entry and low != 'references'
            ]

        app.logger.info(f"Content parsed into {len(result['sections'])} sections and {len(result['references'])} references")
        return result

    except Exception as e:
        app.logger.error(f"Error parsing formatted content: {str(e)}", exc_info=True)
        # Return a basic structure if parsing fails
        return {
            'abstract': 'Error parsing content.',
            'keywords': ['IEEE', 'format'],
            'sections': [{'title': 'Content', 'content': text}],
            'references': []
        }

def format_content(content):
    """Format the content using the ML model.

    Builds an instruction prompt around ``content``, samples a continuation
    from the module-level ``generator`` pipeline and hands the generated text
    (with the echoed prompt removed) to parse_formatted_content().

    Args:
        content: The raw paper content; it is converted with str().

    Returns:
        The dict produced by parse_formatted_content(). If anything raises
        (including ``generator`` still being None), a fallback dict holding
        the original content as a single "Content" section is returned.
    """
    try:
        app.logger.info("Formatting content with ML model")
        prompt = f"Format this research content to IEEE standards with sections, abstract, and references:\n\n{str(content)}"

        response = generator(
            prompt,
            max_new_tokens=1024,  # Increased for more complete generation
            temperature=0.5,      # More deterministic output
            do_sample=True,
            truncation=True,
            num_return_sequences=1
        )

        generated_text = response[0]['generated_text']

        # Remove the prompt only when it really is the leading part of the
        # output; a substring test would allow slicing len(prompt)
        # characters off text that merely contains the prompt further in.
        if generated_text.startswith(prompt):
            formatted_text = generated_text[len(prompt):].strip()
        else:
            formatted_text = generated_text

        app.logger.info("Content formatted successfully")

        # Parse the formatted text into structured sections
        return parse_formatted_content(formatted_text)

    except Exception as e:
        app.logger.error(f"Error formatting content: {str(e)}", exc_info=True)
        # Return the original content if formatting fails
        return {
            'abstract': 'Content processing error.',
            'keywords': ['IEEE', 'format'],
            'sections': [{'title': 'Content', 'content': str(content)}],
            'references': []
        }

# When executed as a script, serve the app with Flask's built-in server,
# bound to 0.0.0.0 on port 5000.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)