Omarrran committed
Commit 2b40e0f · verified · 1 Parent(s): e095dcf

Create app.py

Files changed (1)
  1. app.py +1272 -0
app.py ADDED
@@ -0,0 +1,1272 @@
1
+ # Comprehensive Unstructured Document Processing with Gradio Interface
2
+ # This app (written as notebook-style cells) demonstrates a robust implementation of Unstructured's features with a Gradio UI
3
+
4
+ # Cell 1: Install required packages ('!pip'/'!python' shell magics below only work in a notebook;
+ # when this file runs as app.py, declare the packages in requirements.txt instead)
5
+ # !pip install -q unstructured "unstructured[all-docs]" gradio pandas numpy matplotlib seaborn plotly sentence-transformers nltk langchain google-colab tqdm huggingface_hub python-magic pdfminer.six pdf2image tabulate pytesseract pillow
6
+
7
+ # Also install spaCy for NER and other text processing
8
+ # !pip install -q spacy
9
+ # !python -m spacy download en_core_web_sm
10
+
11
+ # Cell 2: Import necessary libraries
12
+ import os
13
+ import re
14
+ import json
15
+ import time
16
+ import nltk
17
+ import spacy
18
+ import numpy as np
19
+ import pandas as pd
20
+ import matplotlib.pyplot as plt
21
+ import seaborn as sns
22
+ import plotly.express as px
23
+ import plotly.graph_objects as go
24
+ from IPython.display import display, HTML, clear_output
25
+ from datetime import datetime
26
+ from tqdm.auto import tqdm
27
+ import tempfile
28
+ import shutil
29
+ import logging
30
+ import warnings
31
+ from pathlib import Path
32
+ import gradio as gr
33
+
34
+ # Set up logging
35
+ logging.basicConfig(
36
+ level=logging.INFO,
37
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
38
+ handlers=[logging.StreamHandler()]
39
+ )
40
+ logger = logging.getLogger("UnstructuredApp")
41
+
42
+ # Suppress warnings
43
+ warnings.filterwarnings('ignore')
44
+
45
+ # Download required NLTK data
46
+ nltk.download('punkt', quiet=True)
47
+ nltk.download('stopwords', quiet=True)
48
+ nltk.download('wordnet', quiet=True)
49
+
50
+ # Load spaCy model
51
+ try:
+     nlp = spacy.load("en_core_web_sm")
+ except OSError:
+     # Model not installed (e.g. the download step above was skipped): fetch it once at startup
+     from spacy.cli import download
+     download("en_core_web_sm")
+     nlp = spacy.load("en_core_web_sm")
52
+
53
+ # Import Unstructured components
54
+ from unstructured.partition.auto import partition
55
+ from unstructured.partition.pdf import partition_pdf
56
+ from unstructured.partition.html import partition_html
57
+ from unstructured.partition.pptx import partition_pptx
58
+ from unstructured.partition.docx import partition_docx
59
+ from unstructured.partition.xlsx import partition_xlsx
60
+ from unstructured.partition.image import partition_image
61
+ from unstructured.partition.email import partition_email
62
+ from unstructured.partition.json import partition_json
63
+ from unstructured.partition.csv import partition_csv
64
+ from unstructured.partition.xml import partition_xml
65
+ from unstructured.cleaners.core import (
66
+ clean_extra_whitespace,
67
+ replace_unicode_quotes,
68
+ clean_bullets,
69
+ group_broken_paragraphs,
70
+ clean_dashes,
71
+ remove_punctuation
72
+ )
73
+ # Use regex patterns instead of unavailable extract functions
74
+ import re
75
+ from unstructured.staging.base import elements_to_json
76
+ from unstructured.chunking.title import chunk_by_title
77
+ from unstructured.staging.base import convert_to_dict
78
+ from unstructured.documents.elements import (
79
+ Title, Text, NarrativeText, ListItem,
80
+ Table, Image, PageBreak, Footer, Header,
81
+ Address
82
+ )
83
+
84
+ # Define our own regex patterns for extraction
85
+ EMAIL_PATTERN = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
86
+ URL_PATTERN = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w\.-]*/?'
87
+ PHONE_PATTERN = r'(?:\+\d{1,3}[- ]?)?\(?\d{3}\)?[- ]?\d{3}[- ]?\d{4}'  # non-capturing group so re.findall returns full numbers
88
+ IP_PATTERN = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
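+ # Illustrative check of the patterns above (the sample string is made up):
+ #   sample = "Reach jane.doe@example.com or +1 555-123-4567, docs at https://example.com/help"
+ #   re.findall(EMAIL_PATTERN, sample)   # -> ['jane.doe@example.com']
+ #   re.findall(PHONE_PATTERN, sample)   # -> ['+1 555-123-4567']
+ #   re.findall(URL_PATTERN, sample)     # -> ['https://example.com/help']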
89
+ from sentence_transformers import SentenceTransformer, util
90
+
91
+ # Cell 3: Define utility functions for file handling and processing
92
+ def create_temp_dir():
93
+ """Create a temporary directory for file uploads"""
94
+ temp_dir = tempfile.mkdtemp()
95
+ return temp_dir
96
+
97
+ def save_uploaded_file(file, temp_dir):
98
+ """Save uploaded file to temporary directory"""
99
+ if file is None:
100
+ return None
101
+
102
+ file_path = os.path.join(temp_dir, file.name)
103
+ with open(file_path, 'wb') as f:
104
+ f.write(file.read())
105
+ return file_path
106
+
107
+ def get_file_extension(file_path):
108
+ """Get file extension from path"""
109
+ if file_path is None:
110
+ return None
111
+ return os.path.splitext(file_path)[1].lower()
112
+
113
+ def identify_file_type(file_path):
114
+ """Identify file type based on extension"""
115
+ if file_path is None:
116
+ return None
117
+
118
+ ext = get_file_extension(file_path)
119
+ file_types = {
120
+ '.pdf': 'PDF',
121
+ '.html': 'HTML',
122
+ '.htm': 'HTML',
123
+ '.docx': 'DOCX',
124
+ '.doc': 'DOC',
125
+ '.pptx': 'PPTX',
126
+ '.ppt': 'PPT',
127
+ '.xlsx': 'XLSX',
128
+ '.xls': 'XLS',
129
+ '.txt': 'TXT',
130
+ '.csv': 'CSV',
131
+ '.json': 'JSON',
132
+ '.xml': 'XML',
133
+ '.eml': 'EMAIL',
134
+ '.msg': 'EMAIL',
135
+ '.jpg': 'IMAGE',
136
+ '.jpeg': 'IMAGE',
137
+ '.png': 'IMAGE',
138
+ '.tiff': 'IMAGE',
139
+ '.tif': 'IMAGE'
140
+ }
141
+
142
+ return file_types.get(ext, 'UNKNOWN')
143
+
144
+ def partition_file(file_path, partition_kwargs=None):
145
+ """
146
+ Partition file using appropriate method based on file type
147
+
148
+ Args:
149
+ file_path: Path to the file
150
+ partition_kwargs: Dictionary of kwargs for partition function
151
+
152
+ Returns:
153
+ List of elements
154
+ """
155
+ if file_path is None:
156
+ return []
157
+
158
+ if partition_kwargs is None:
159
+ partition_kwargs = {}
160
+
161
+ file_type = identify_file_type(file_path)
162
+
163
+ try:
164
+ if file_type == 'PDF':
165
+ # Add PDF-specific kwargs
166
+ pdf_kwargs = {
167
+ 'extract_images': True,
168
+ 'infer_table_structure': True,
169
+ 'include_page_breaks': True,
170
+ **partition_kwargs
171
+ }
172
+ return partition_pdf(filename=file_path, **pdf_kwargs)
173
+
174
+ elif file_type == 'HTML':
175
+ # Add HTML-specific kwargs
176
+ html_kwargs = {
177
+ 'extract_links': True,
178
+ **partition_kwargs
179
+ }
180
+ return partition_html(filename=file_path, **html_kwargs)
181
+
182
+ elif file_type == 'DOCX':
183
+ return partition_docx(filename=file_path, **partition_kwargs)
184
+
185
+ elif file_type == 'PPTX':
186
+ return partition_pptx(filename=file_path, **partition_kwargs)
187
+
188
+ elif file_type == 'XLSX':
189
+ return partition_xlsx(filename=file_path, **partition_kwargs)
190
+
191
+ elif file_type == 'IMAGE':
192
+ # Add image-specific kwargs
193
+ image_kwargs = {
194
+ 'strategy': 'hi_res',
195
+ 'languages': ['eng'],
196
+ **partition_kwargs
197
+ }
198
+ return partition_image(filename=file_path, **image_kwargs)
199
+
200
+ elif file_type == 'EMAIL':
201
+ return partition_email(filename=file_path, **partition_kwargs)
202
+
203
+ elif file_type == 'JSON':
204
+ return partition_json(filename=file_path, **partition_kwargs)
205
+
206
+ elif file_type == 'CSV':
207
+ return partition_csv(filename=file_path, **partition_kwargs)
208
+
209
+ elif file_type == 'XML':
210
+ return partition_xml(filename=file_path, **partition_kwargs)
211
+
212
+ else:
213
+ # Use auto partition for other file types
214
+ return partition(filename=file_path, **partition_kwargs)
215
+
216
+ except Exception as e:
217
+ logger.error(f"Error partitioning file {file_path}: {str(e)}")
218
+ raise Exception(f"Error processing {file_path}: {str(e)}")
219
+
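+ # Illustrative call (hypothetical path; the per-format kwargs shown above are merged with whatever is passed in):
+ #   elements = partition_file("reports/q3_summary.pdf", {"strategy": "fast"})
+ #   print(len(elements), type(elements[0]).__name__)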
220
+ # Cell 4: Define element cleaning and processing functions
221
+ def clean_elements(elements, cleaning_options=None):
222
+ """
223
+ Clean elements based on selected options
224
+
225
+ Args:
226
+ elements: List of elements to clean
227
+ cleaning_options: Dictionary of cleaning options to apply
228
+
229
+ Returns:
230
+ Cleaned elements
231
+ """
232
+ if cleaning_options is None or not elements:
233
+ return elements
234
+
235
+ cleaned_elements = []
236
+ for element in elements:
237
+ # Skip non-text elements
238
+ if not hasattr(element, 'text'):
239
+ cleaned_elements.append(element)
240
+ continue
241
+
242
+ # Apply cleaning operations based on selected options
243
+ cleaned_text = element.text
244
+
245
+ if cleaning_options.get('extra_whitespace', False):
246
+ cleaned_text = clean_extra_whitespace(cleaned_text)
247
+
248
+ if cleaning_options.get('unicode_quotes', False):
249
+ cleaned_text = replace_unicode_quotes(cleaned_text)
250
+
251
+ if cleaning_options.get('bullets', False):
252
+ cleaned_text = clean_bullets(cleaned_text)
253
+
254
+ if cleaning_options.get('dashes', False):
255
+ cleaned_text = clean_dashes(cleaned_text)
256
+
257
+ if cleaning_options.get('group_paragraphs', False):
258
+ cleaned_text = group_broken_paragraphs(cleaned_text)
259
+
260
+ if cleaning_options.get('remove_punctuation', False):
261
+ cleaned_text = remove_punctuation(cleaned_text)
262
+
263
+ # Update the element's text
264
+ element.text = cleaned_text
265
+ cleaned_elements.append(element)
266
+
267
+ return cleaned_elements
268
+
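+ # Illustrative use, mirroring the cleaning checkboxes exposed in the UI:
+ #   cleaned = clean_elements(elements, {"extra_whitespace": True, "unicode_quotes": True})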
269
+ def extract_entities(elements, extraction_options=None):
270
+ """
271
+ Extract entities from elements based on selected options using regex
272
+
273
+ Args:
274
+ elements: List of elements
275
+ extraction_options: Dictionary of extraction options to apply
276
+
277
+ Returns:
278
+ Elements with extracted entities in metadata
279
+ """
280
+ if extraction_options is None or not elements:
281
+ return elements
282
+
283
+ processed_elements = []
284
+
285
+ for element in elements:
286
+ # Skip non-text elements
287
+ if not hasattr(element, 'text'):
288
+ processed_elements.append(element)
289
+ continue
290
+
291
+ # Initialize metadata if it doesn't exist
292
+ if not hasattr(element, 'metadata'):
293
+ element.metadata = {}
294
+
295
+ element.metadata['extracted_entities'] = {}
296
+
297
+ # Extract entities based on selected options using regex
298
+ if extraction_options.get('emails', False):
299
+ element.metadata['extracted_entities']['emails'] = re.findall(EMAIL_PATTERN, element.text)
300
+
301
+ if extraction_options.get('urls', False):
302
+ element.metadata['extracted_entities']['urls'] = re.findall(URL_PATTERN, element.text)
303
+
304
+ if extraction_options.get('phone_numbers', False):
305
+ element.metadata['extracted_entities']['phone_numbers'] = re.findall(PHONE_PATTERN, element.text)
306
+
307
+ if extraction_options.get('ip_addresses', False):
308
+ element.metadata['extracted_entities']['ip_addresses'] = re.findall(IP_PATTERN, element.text)
309
+
310
+ # Use spaCy for NER if selected
311
+ if extraction_options.get('ner', False):
312
+ doc = nlp(element.text)
313
+ element.metadata['extracted_entities']['named_entities'] = [
314
+ {'text': ent.text, 'label': ent.label_} for ent in doc.ents
315
+ ]
316
+
317
+ processed_elements.append(element)
318
+
319
+ return processed_elements
320
+
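+ # Illustrative use: stash regex/NER hits on each element's metadata
+ #   enriched = extract_entities(cleaned, {"emails": True, "urls": True, "ner": True})
+ #   enriched[0].metadata['extracted_entities']  # e.g. {'emails': [...], 'urls': [...], 'named_entities': [...]}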
321
+ def categorize_elements(elements):
322
+ """
323
+ Categorize elements by type and provide statistics
324
+
325
+ Args:
326
+ elements: List of elements
327
+
328
+ Returns:
329
+ Dictionary with element statistics
330
+ """
331
+ if not elements:
332
+ return {}
333
+
334
+ element_types = {}
335
+ for element in elements:
336
+ element_type = type(element).__name__
337
+ if element_type not in element_types:
338
+ element_types[element_type] = 0
339
+ element_types[element_type] += 1
340
+
341
+ total_elements = len(elements)
342
+ element_stats = {
343
+ 'total': total_elements,
344
+ 'by_type': element_types,
345
+ 'type_percentages': {k: round(v/total_elements*100, 2) for k, v in element_types.items()}
346
+ }
347
+
348
+ return element_stats
349
+
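+ # The returned stats have the shape (numbers are illustrative):
+ #   {'total': 42, 'by_type': {'NarrativeText': 30, 'Title': 12}, 'type_percentages': {'NarrativeText': 71.43, 'Title': 28.57}}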
350
+ def chunk_elements(elements, chunking_method, **kwargs):
351
+ """
352
+ Chunk elements using specified method
353
+
354
+ Args:
355
+ elements: List of elements to chunk
356
+ chunking_method: Method to use for chunking
357
+ **kwargs: Additional arguments for chunking method
358
+
359
+ Returns:
360
+ List of chunks
361
+ """
362
+ if not elements:
363
+ return []
364
+
365
+ try:
366
+ if chunking_method == 'by_title':
367
+ return chunk_by_title(elements, **kwargs)
368
+ elif chunking_method == 'by_token':
369
+ # Implement a simple version of token-based chunking
370
+ from unstructured.chunking.base import Chunk
371
+
372
+ max_chars = kwargs.get('max_characters', 2000)
373
+
374
+ chunks = []
375
+ current_chunk = []
376
+ current_char_count = 0
377
+
378
+ for element in elements:
379
+ if not hasattr(element, 'text'):
380
+ # If the element has no text, just add it to the current chunk
381
+ current_chunk.append(element)
382
+ continue
383
+
384
+ element_text_len = len(element.text)
385
+
386
+ # If adding this element would exceed the max chars, start a new chunk
387
+ if current_char_count + element_text_len > max_chars and current_chunk:
388
+ chunks.append(Chunk(elements=current_chunk))
389
+ current_chunk = [element]
390
+ current_char_count = element_text_len
391
+ else:
392
+ current_chunk.append(element)
393
+ current_char_count += element_text_len
394
+
395
+ # Add the last chunk if it's not empty
396
+ if current_chunk:
397
+ chunks.append(Chunk(elements=current_chunk))
398
+
399
+ return chunks
400
+ else:
401
+ # Default to title chunking
402
+ return chunk_by_title(elements, **kwargs)
403
+ except Exception as e:
404
+ logger.error(f"Error chunking elements: {str(e)}")
405
+ # If chunking fails, return single chunk with all elements
406
+ from unstructured.chunking.base import Chunk
407
+ return [Chunk(elements=elements)]
408
+
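+ # Illustrative calls: group elements under their section titles, or fall back to simple size-based chunks
+ #   chunks = chunk_elements(elements, "by_title", combine_text_under_n_chars=300)
+ #   chunks = chunk_elements(elements, "by_token", max_characters=2000)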
409
+ # Cell 5: Define functions for visualization and analysis
410
+ def visualize_element_distribution(element_stats):
411
+ """
412
+ Create a bar chart of element type distribution
413
+
414
+ Args:
415
+ element_stats: Dictionary with element statistics
416
+
417
+ Returns:
418
+ Plotly figure
419
+ """
420
+ if not element_stats or 'by_type' not in element_stats:
421
+ return None
422
+
423
+ element_types = list(element_stats['by_type'].keys())
424
+ element_counts = list(element_stats['by_type'].values())
425
+
426
+ fig = px.bar(
427
+ x=element_types,
428
+ y=element_counts,
429
+ labels={'x': 'Element Type', 'y': 'Count'},
430
+ title='Distribution of Element Types',
431
+ color=element_types,
432
+ text=element_counts
433
+ )
434
+
435
+ fig.update_layout(
436
+ xaxis_title='Element Type',
437
+ yaxis_title='Count',
438
+ showlegend=False
439
+ )
440
+
441
+ return fig
442
+
443
+ def generate_embeddings(chunks, model_name):
444
+ """
445
+ Generate embeddings for chunks
446
+
447
+ Args:
448
+ chunks: List of chunks
449
+ model_name: Name of the embedding model to use
450
+
451
+ Returns:
452
+ Dictionary with chunk texts and embeddings
453
+ """
454
+ if not chunks:
455
+ return {}
456
+
457
+ # Load model
458
+ try:
459
+ model = SentenceTransformer(model_name)
460
+ except Exception as e:
461
+ logger.error(f"Error loading embedding model: {str(e)}")
462
+ raise Exception(f"Error loading embedding model {model_name}: {str(e)}")
463
+
464
+ # Generate text for embedding
465
+ chunk_texts = []
466
+ for chunk in chunks:
467
+ chunk_text = "\n".join([e.text for e in chunk.elements if hasattr(e, 'text')])
468
+ chunk_texts.append(chunk_text)
469
+
470
+ # Generate embeddings
471
+ embeddings = model.encode(chunk_texts, show_progress_bar=True)
472
+
473
+ return {
474
+ 'texts': chunk_texts,
475
+ 'embeddings': embeddings,
476
+ 'model': model_name,
477
+ 'dimension': embeddings.shape[1]
478
+ }
479
+
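+ # Illustrative call (the model is fetched from the Hugging Face Hub on first use):
+ #   emb = generate_embeddings(chunks, "all-MiniLM-L6-v2")
+ #   emb['embeddings'].shape  # (num_chunks, 384) -- all-MiniLM-L6-v2 produces 384-dimensional vectors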
480
+ def visualize_embeddings_tsne(embedding_data):
481
+ """
482
+ Visualize embeddings using t-SNE
483
+
484
+ Args:
485
+ embedding_data: Dictionary with embeddings
486
+
487
+ Returns:
488
+ Plotly figure
489
+ """
490
+ if not embedding_data or 'embeddings' not in embedding_data:
491
+ return None
492
+
493
+ from sklearn.manifold import TSNE
494
+
495
+ # Apply t-SNE to reduce dimensions for visualization
496
+ # perplexity must be smaller than the number of samples, which is often just a handful of chunks
+ n_samples = embedding_data['embeddings'].shape[0]
+ tsne = TSNE(n_components=2, random_state=42, perplexity=max(1.0, min(30.0, n_samples - 1)))
497
+ reduced_embeddings = tsne.fit_transform(embedding_data['embeddings'])
498
+
499
+ # Create DataFrame for plotting
500
+ df = pd.DataFrame({
501
+ 'x': reduced_embeddings[:, 0],
502
+ 'y': reduced_embeddings[:, 1],
503
+ 'chunk_id': [f"Chunk {i+1}" for i in range(len(reduced_embeddings))]
504
+ })
505
+
506
+ # Add text length as size
507
+ df['text_length'] = [len(text) for text in embedding_data['texts']]
508
+
509
+ # Normalize text length for sizing
510
+ max_length = df['text_length'].max()
511
+ df['size'] = df['text_length'].apply(lambda x: max(10, min(40, x / max_length * 40)))
512
+
513
+ # Create plot
514
+ fig = px.scatter(
515
+ df, x='x', y='y',
516
+ text='chunk_id',
517
+ size='size',
518
+ title=f"t-SNE Visualization of Document Embeddings ({embedding_data['model']})",
519
+ hover_data=['text_length']
520
+ )
521
+
522
+ fig.update_traces(
523
+ textposition='top center',
524
+ marker=dict(sizemode='diameter')
525
+ )
526
+
527
+ fig.update_layout(
528
+ xaxis_title='t-SNE Dimension 1',
529
+ yaxis_title='t-SNE Dimension 2',
530
+ showlegend=False
531
+ )
532
+
533
+ return fig
534
+
535
+ def generate_similarity_matrix(embedding_data):
536
+ """
537
+ Generate similarity matrix for chunks
538
+
539
+ Args:
540
+ embedding_data: Dictionary with embeddings
541
+
542
+ Returns:
543
+ Plotly figure with similarity matrix
544
+ """
545
+ if not embedding_data or 'embeddings' not in embedding_data:
546
+ return None
547
+
548
+ # Calculate cosine similarity
549
+ embeddings = embedding_data['embeddings']
550
+ similarity_matrix = util.cos_sim(embeddings, embeddings).numpy()
551
+
552
+ # Create labels for each chunk
553
+ labels = [f"Chunk {i+1}" for i in range(similarity_matrix.shape[0])]
554
+
555
+ # Create heatmap
556
+ fig = go.Figure(data=go.Heatmap(
557
+ z=similarity_matrix,
558
+ x=labels,
559
+ y=labels,
560
+ colorscale='Viridis',
561
+ zmin=0, zmax=1
562
+ ))
563
+
564
+ fig.update_layout(
565
+ title='Semantic Similarity Between Chunks',
566
+ xaxis_title='Chunk ID',
567
+ yaxis_title='Chunk ID',
568
+ )
569
+
570
+ return fig
571
+
572
+ def extract_top_keywords(chunks, top_n=10):
573
+ """
574
+ Extract top keywords from chunks using TF-IDF
575
+
576
+ Args:
577
+ chunks: List of chunks
578
+ top_n: Number of top keywords to extract
579
+
580
+ Returns:
581
+ Dictionary with top keywords for each chunk
582
+ """
583
+ if not chunks:
584
+ return {}
585
+
586
+ from sklearn.feature_extraction.text import TfidfVectorizer
587
+ from nltk.corpus import stopwords
588
+
589
+ # Get text from each chunk
590
+ chunk_texts = []
591
+ for chunk in chunks:
592
+ chunk_text = " ".join([e.text for e in chunk.elements if hasattr(e, 'text')])
593
+ chunk_texts.append(chunk_text)
594
+
595
+ # Get English stopwords
596
+ stop_words = set(stopwords.words('english'))
597
+
598
+ # Initialize vectorizer
599
+ vectorizer = TfidfVectorizer(
600
+ max_features=1000,
601
+ stop_words=stop_words,
602
+ ngram_range=(1, 2)
603
+ )
604
+
605
+ # Fit vectorizer
606
+ try:
607
+ tfidf_matrix = vectorizer.fit_transform(chunk_texts)
608
+ except Exception as e:
609
+ logger.error(f"Error extracting keywords: {str(e)}")
610
+ return {}
611
+
612
+ # Get feature names
613
+ feature_names = vectorizer.get_feature_names_out()
614
+
615
+ # Extract top keywords for each chunk
616
+ top_keywords = {}
617
+ for i, chunk_vec in enumerate(tfidf_matrix):
618
+ # Convert sparse matrix to dense and get top indices
619
+ dense = chunk_vec.todense()
620
+ dense_list = dense.tolist()[0]
621
+ sorted_indices = np.argsort(dense_list)[::-1][:top_n]
622
+
623
+ # Get keywords and scores
624
+ keywords = [(feature_names[idx], dense_list[idx]) for idx in sorted_indices]
625
+
626
+ top_keywords[f"Chunk {i+1}"] = keywords
627
+
628
+ return top_keywords
629
+
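+ # Example of the returned structure (keywords and scores are illustrative):
+ #   {'Chunk 1': [('neural network', 0.41), ('training data', 0.33), ...], 'Chunk 2': [...]}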
630
+ def visualize_keywords(keywords_data):
631
+ """
632
+ Visualize top keywords across chunks
633
+
634
+ Args:
635
+ keywords_data: Dictionary with keywords for each chunk
636
+
637
+ Returns:
638
+ Plotly figure
639
+ """
640
+ if not keywords_data:
641
+ return None
642
+
643
+ # Prepare data for visualization
644
+ data = []
645
+ for chunk_id, keywords in keywords_data.items():
646
+ for keyword, score in keywords:
647
+ data.append({
648
+ 'chunk': chunk_id,
649
+ 'keyword': keyword,
650
+ 'score': score
651
+ })
652
+
653
+ # Create DataFrame
654
+ df = pd.DataFrame(data)
655
+
656
+ # Create heatmap
657
+ pivot_df = df.pivot(index='keyword', columns='chunk', values='score')
658
+
659
+ # Sort by average score
660
+ pivot_df['avg'] = pivot_df.mean(axis=1)
661
+ pivot_df = pivot_df.sort_values('avg', ascending=False).drop('avg', axis=1)
662
+
663
+ # Create figure
664
+ fig = px.imshow(
665
+ pivot_df,
666
+ labels=dict(x="Chunk", y="Keyword", color="TF-IDF Score"),
667
+ x=pivot_df.columns,
668
+ y=pivot_df.index,
669
+ color_continuous_scale="Viridis",
670
+ aspect="auto"
671
+ )
672
+
673
+ fig.update_layout(
674
+ title='Top Keywords Across Chunks',
675
+ height=600
676
+ )
677
+
678
+ return fig
679
+
680
+ # Cell 6: Define functions for the final output formats
681
+ def generate_final_output(chunks, embedding_data=None, processing_stats=None):
682
+ """
683
+ Generate final structured output
684
+
685
+ Args:
686
+ chunks: List of chunks
687
+ embedding_data: Dictionary with embeddings
688
+ processing_stats: Dictionary with processing statistics
689
+
690
+ Returns:
691
+ Dictionary with final structured data
692
+ """
693
+ if not chunks:
694
+ return {}
695
+
696
+ # Initialize final data structure
697
+ final_data = {
698
+ 'metadata': {
699
+ 'timestamp': datetime.now().isoformat(),
700
+ 'num_chunks': len(chunks),
701
+ 'processing_stats': processing_stats or {}
702
+ },
703
+ 'chunks': []
704
+ }
705
+
706
+ # Get embeddings if available
707
+ embeddings = embedding_data.get('embeddings', []) if embedding_data else []
708
+
709
+ # Process each chunk
710
+ for i, chunk in enumerate(chunks):
711
+ # Get text from chunk
712
+ chunk_text = "\n".join([e.text for e in chunk.elements if hasattr(e, 'text')])
713
+
714
+ # Get element types in chunk
715
+ element_types = {}
716
+ for e in chunk.elements:
717
+ element_type = type(e).__name__
718
+ if element_type not in element_types:
719
+ element_types[element_type] = 0
720
+ element_types[element_type] += 1
721
+
722
+ # Add chunk data
723
+ chunk_data = {
724
+ 'chunk_id': f"chunk_{i+1}",
725
+ 'metadata': {
726
+ 'element_types': element_types,
727
+ 'num_elements': len(chunk.elements),
728
+ 'text_length': len(chunk_text)
729
+ },
730
+ 'text': chunk_text,
731
+ 'elements': convert_to_dict(chunk.elements)  # convert_to_dict expects the full list of elements
732
+ }
733
+
734
+ # Add embedding if available
735
+ if i < len(embeddings):
736
+ chunk_data['embedding'] = embeddings[i].tolist()
737
+
738
+ final_data['chunks'].append(chunk_data)
739
+
740
+ return final_data
741
+
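+ # Illustrative follow-up: persist the structured output (embeddings are stored as plain lists above,
+ # so the result is JSON-serializable as long as the element metadata is too):
+ #   with open("processed_output.json", "w") as f:
+ #       json.dump(final_data, f, indent=2)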
742
+ def format_for_qa(chunks):
743
+ """
744
+ Format chunks for question answering
745
+
746
+ Args:
747
+ chunks: List of chunks
748
+
749
+ Returns:
750
+ List of documents in format suitable for QA systems
751
+ """
752
+ if not chunks:
753
+ return []
754
+
755
+ qa_docs = []
756
+ for i, chunk in enumerate(chunks):
757
+ # Get text from chunk
758
+ chunk_text = "\n".join([e.text for e in chunk.elements if hasattr(e, 'text')])
759
+
760
+ # Create document
761
+ doc = {
762
+ 'id': f"chunk_{i+1}",
763
+ 'content': chunk_text,
764
+ 'metadata': {
765
+ 'num_elements': len(chunk.elements),
766
+ 'element_types': [type(e).__name__ for e in chunk.elements]
767
+ }
768
+ }
769
+
770
+ qa_docs.append(doc)
771
+
772
+ return qa_docs
773
+
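+ # Each QA document produced above looks like:
+ #   {'id': 'chunk_1', 'content': '...chunk text...', 'metadata': {'num_elements': 5, 'element_types': ['Title', 'NarrativeText', ...]}}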
774
+ def format_for_transformers(chunks):
775
+ """
776
+ Format chunks for HuggingFace transformers
777
+
778
+ Args:
779
+ chunks: List of chunks
780
+
781
+ Returns:
782
+ Dictionary with data formatted for transformers
783
+ """
784
+ if not chunks:
785
+ return {}
786
+
787
+ # Create a simple format for transformers
788
+ try:
789
+ # Extract text from chunks
790
+ texts = []
791
+ for chunk in chunks:
792
+ chunk_text = "\n".join([e.text for e in chunk.elements if hasattr(e, 'text')])
793
+ texts.append(chunk_text)
794
+
795
+ # Create dataset structure
796
+ transformer_data = {
797
+ "text": texts,
798
+ "metadata": [{"chunk_id": f"chunk_{i}"} for i in range(len(texts))]
799
+ }
800
+ return transformer_data
801
+
802
+ except Exception as e:
803
+ logger.error(f"Error formatting for transformers: {str(e)}")
804
+ return {}
805
+
806
+ def format_for_label_studio(elements):
807
+ """
808
+ Format elements for Label Studio
809
+
810
+ Args:
811
+ elements: List of elements
812
+
813
+ Returns:
814
+ Dictionary with data formatted for Label Studio
815
+ """
816
+ if not elements:
817
+ return {}
818
+
819
+ try:
820
+ # Create a basic format for Label Studio
821
+ label_studio_data = []
822
+ for i, element in enumerate(elements):
823
+ if hasattr(element, 'text'):
824
+ label_studio_data.append({
825
+ "id": i,
826
+ "text": element.text,
827
+ "element_type": type(element).__name__,
828
+ "metadata": element.metadata if hasattr(element, 'metadata') else {}
829
+ })
830
+
831
+ return label_studio_data
832
+ except Exception as e:
833
+ logger.error(f"Error formatting for Label Studio: {str(e)}")
834
+ return {}
835
+
836
+ # Cell 7: Build the Gradio interface components
837
+ def process_files(
838
+ files,
839
+ partition_options,
840
+ cleaning_options,
841
+ extraction_options,
842
+ chunking_method,
843
+ chunking_options,
844
+ embedding_model,
845
+ output_format
846
+ ):
847
+ """
848
+ Main processing function for the Gradio interface
849
+
850
+ Args:
851
+ files: List of uploaded files
852
+ partition_options: Dictionary of partitioning options
853
+ cleaning_options: Dictionary of cleaning options
854
+ extraction_options: Dictionary of extraction options
855
+ chunking_method: Method to use for chunking
856
+ chunking_options: Dictionary of chunking options
857
+ embedding_model: Model to use for embeddings
858
+ output_format: Format for final output
859
+
860
+ Returns:
861
+ Tuple of (
862
+ status_html,
863
+ log_html,
864
+ element_stats,
865
+ element_chart,
866
+ similarity_matrix,
867
+ embedding_viz,
868
+ keyword_viz,
869
+ output_data
870
+ )
871
+ """
872
+ # Create temp directory for uploads
873
+ temp_dir = create_temp_dir()
874
+
875
+ # Initialize status and logs
876
+ status_html = "<div style='color: blue;'>Initializing processing pipeline...</div>"
877
+ log_html = "<div style='font-family: monospace; height: 200px; overflow-y: auto;'>"
878
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Starting document processing pipeline\n"
879
+
880
+ try:
881
+ # Save uploaded files
882
+ file_paths = []
883
+ for file in files:
884
+ if file is None:
885
+ continue
886
+
887
+ file_path = save_uploaded_file(file, temp_dir)
888
+ file_paths.append(file_path)
889
+
890
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Saved {file.name} to temporary directory\n"
891
+
892
+ if not file_paths:
893
+ status_html = "<div style='color: red;'>No files were uploaded. Please upload at least one file.</div>"
894
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Error: No files were uploaded\n"
895
+ log_html += "</div>"
896
+ return status_html, log_html, None, None, None, None, None, None
897
+
898
+ # Process each file
899
+ all_elements = []
900
+ for file_path in file_paths:
901
+ file_name = os.path.basename(file_path)
902
+ file_type = identify_file_type(file_path)
903
+
904
+ status_html = f"<div style='color: blue;'>Processing {file_name} ({file_type})...</div>"
905
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Processing {file_name} ({file_type})\n"
906
+
907
+ # Partition file
908
+ partition_kwargs = {k: v for k, v in partition_options.items() if v}
909
+
910
+ elements = partition_file(file_path, partition_kwargs)
911
+
912
+ # Add source information to elements
913
+ for element in elements:
914
+ if not hasattr(element, 'metadata'):
915
+ element.metadata = {}
916
+
917
+ element.metadata.update({
918
+ 'source_filename': file_name,
919
+ 'source_filetype': file_type
920
+ })
921
+
922
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Extracted {len(elements)} elements from {file_name}\n"
923
+ all_elements.extend(elements)
924
+
925
+ # Process all elements
926
+ status_html = "<div style='color: blue;'>Cleaning and processing elements...</div>"
927
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Processing {len(all_elements)} elements\n"
928
+
929
+ # Clean elements
930
+ cleaning_kwargs = {k: v for k, v in cleaning_options.items() if v}
931
+ if cleaning_kwargs:
932
+ cleaned_elements = clean_elements(all_elements, cleaning_kwargs)
933
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Applied {len(cleaning_kwargs)} cleaning operations\n"
934
+ else:
935
+ cleaned_elements = all_elements
936
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] No cleaning operations selected\n"
937
+
938
+ # Extract entities
939
+ extraction_kwargs = {k: v for k, v in extraction_options.items() if v}
940
+ if extraction_kwargs:
941
+ processed_elements = extract_entities(cleaned_elements, extraction_kwargs)
942
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Applied {len(extraction_kwargs)} extraction operations\n"
943
+ else:
944
+ processed_elements = cleaned_elements
945
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] No extraction operations selected\n"
946
+
947
+ # Categorize elements
948
+ element_stats = categorize_elements(processed_elements)
949
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Categorized {element_stats['total']} elements into {len(element_stats['by_type'])} types\n"
950
+
951
+ # Create element distribution chart
952
+ element_chart = visualize_element_distribution(element_stats)
953
+
954
+ # Chunk elements
955
+ status_html = "<div style='color: blue;'>Chunking elements...</div>"
956
+ chunking_kwargs = {k: v for k, v in chunking_options.items() if v}
957
+ chunks = chunk_elements(processed_elements, chunking_method, **chunking_kwargs)
958
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Created {len(chunks)} chunks using {chunking_method} method\n"
959
+
960
+ # Extract keywords
961
+ status_html = "<div style='color: blue;'>Extracting keywords...</div>"
962
+ keywords_data = extract_top_keywords(chunks)
963
+ keyword_viz = visualize_keywords(keywords_data)
964
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Extracted keywords from {len(keywords_data)} chunks\n"
965
+
966
+ # Generate embeddings
967
+ if embedding_model:
968
+ status_html = f"<div style='color: blue;'>Generating embeddings using {embedding_model}...</div>"
969
+ embedding_data = generate_embeddings(chunks, embedding_model)
970
+
971
+ # Create embedding visualizations
972
+ embedding_viz = visualize_embeddings_tsne(embedding_data)
973
+ similarity_matrix = generate_similarity_matrix(embedding_data)
974
+
975
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Generated {embedding_data['dimension']}-dimensional embeddings\n"
976
+ else:
977
+ embedding_data = None
978
+ embedding_viz = None
979
+ similarity_matrix = None
980
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Skipped embedding generation (no model selected)\n"
981
+
982
+ # Generate final output
983
+ status_html = "<div style='color: blue;'>Generating final output...</div>"
984
+
985
+ processing_stats = {
986
+ 'num_files': len(file_paths),
987
+ 'file_types': [identify_file_type(fp) for fp in file_paths],
988
+ 'total_elements': element_stats['total'],
989
+ 'element_types': element_stats['by_type'],
990
+ 'num_chunks': len(chunks)
991
+ }
992
+
993
+ if output_format == 'json':
994
+ output_data = generate_final_output(chunks, embedding_data, processing_stats)
995
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Generated JSON output with {len(output_data['chunks'])} chunks\n"
996
+
997
+ elif output_format == 'qa':
998
+ output_data = format_for_qa(chunks)
999
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Generated Q&A format with {len(output_data)} documents\n"
1000
+
1001
+ elif output_format == 'transformers':
1002
+ output_data = format_for_transformers(chunks)
1003
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Generated Transformer format\n"
1004
+
1005
+ elif output_format == 'label_studio':
1006
+ output_data = format_for_label_studio(processed_elements)
1007
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Generated Label Studio format\n"
1008
+
1009
+ else:
1010
+ # Default to JSON
1011
+ output_data = generate_final_output(chunks, embedding_data, processing_stats)
1012
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Generated default JSON output\n"
1013
+
1014
+ status_html = "<div style='color: green;'>Processing complete! ✅</div>"
1015
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Successfully completed document processing pipeline\n"
1016
+
1017
+ except Exception as e:
1018
+ status_html = f"<div style='color: red;'>Error in processing: {str(e)}</div>"
1019
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] ERROR: {str(e)}\n"
1020
+
1021
+ element_stats = None
1022
+ element_chart = None
1023
+ embedding_viz = None
1024
+ similarity_matrix = None
1025
+ keyword_viz = None
1026
+ output_data = None
1027
+
1028
+ finally:
1029
+ # Clean up temp directory
1030
+ try:
1031
+ shutil.rmtree(temp_dir)
1032
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Cleaned up temporary files\n"
1033
+ except Exception as e:
1034
+ log_html += f"[{datetime.now().strftime('%H:%M:%S')}] Warning: Failed to clean temporary files: {str(e)}\n"
1035
+
1036
+ log_html += "</div>"
1037
+ return status_html, log_html, element_stats, element_chart, similarity_matrix, embedding_viz, keyword_viz, output_data
1038
+
1039
+ # Cell 8: Define the Gradio interface
1040
+ def build_gradio_interface():
1041
+ """
1042
+ Build and launch the Gradio interface
1043
+ """
1044
+ # Define theme
1045
+ custom_theme = gr.themes.Default(
1046
+ primary_hue="indigo",
1047
+ secondary_hue="purple",
1048
+ )
1049
+
1050
+ # Create interface
1051
+ with gr.Blocks(theme=custom_theme, title="Unstructured Document Processing") as app:
1052
+ gr.Markdown("""
1053
+ # 📄 Unstructured Document Processing Pipeline
1054
+
1055
+ This application demonstrates a comprehensive document processing pipeline using the [Unstructured](https://unstructured.io/) library.
1056
+ Upload one or more documents to process them through partitioning, cleaning, extraction, chunking, and embedding.
1057
+
1058
+ **Supported file formats**: PDF, DOCX, PPTX, XLSX, HTML, CSV, JSON, XML, Email, Images (JPG, PNG)
1059
+ """)
1060
+
1061
+ # File upload section
1062
+ with gr.Row():
1063
+ with gr.Column(scale=3):
1064
+ files = gr.File(
1065
+ file_count="multiple",
1066
+ label="Upload Documents",
1067
+ type="binary",
1068
+ file_types=[
1069
+ ".pdf", ".docx", ".pptx", ".xlsx", ".html", ".htm",
1070
+ ".csv", ".json", ".xml", ".eml", ".msg",
1071
+ ".jpg", ".jpeg", ".png", ".txt"
1072
+ ]
1073
+ )
1074
+
1075
+ with gr.Column(scale=2):
1076
+ with gr.Accordion("Status", open=True):
1077
+ status = gr.HTML(value="<div style='color: gray;'>Waiting for files...</div>")
1078
+ with gr.Accordion("Processing Log", open=True):
1079
+ log = gr.HTML(value="<div style='font-family: monospace; height: 200px; overflow-y: auto;'>Processing log will appear here...</div>")
1080
+
1081
+ # Processing options
1082
+ with gr.Tabs():
1083
+ # Partitioning options
1084
+ with gr.TabItem("Partitioning"):
1085
+ gr.Markdown("### Document Partitioning Options")
1086
+
1087
+ with gr.Row():
1088
+ with gr.Column():
1089
+ partition_options = {
1090
+ "extract_images": gr.Checkbox(value=True, label="Extract Images", info="Extract images from documents"),
1091
+ "infer_table_structure": gr.Checkbox(value=True, label="Infer Table Structure", info="Extract tables with structure"),
1092
+ "include_page_breaks": gr.Checkbox(value=True, label="Include Page Breaks", info="Include page break elements"),
1093
+ "include_metadata": gr.Checkbox(value=True, label="Include Metadata", info="Extract document metadata"),
1094
+ "strategy": gr.Radio(choices=["fast", "hi_res", "ocr_only"], value="hi_res", label="OCR Strategy (for images/scanned docs)", info="Fast is quicker but less accurate")
1095
+ }
1096
+
1097
+ # Cleaning options
1098
+ with gr.TabItem("Cleaning"):
1099
+ gr.Markdown("### Text Cleaning Options")
1100
+
1101
+ with gr.Row():
1102
+ with gr.Column():
1103
+ cleaning_options = {
1104
+ "extra_whitespace": gr.Checkbox(value=True, label="Clean Extra Whitespace", info="Remove redundant whitespace"),
1105
+ "unicode_quotes": gr.Checkbox(value=True, label="Replace Unicode Quotes", info="Normalize quotes to ASCII"),
1106
+ "bullets": gr.Checkbox(value=True, label="Clean Bullets", info="Standardize bullet points"),
1107
+ "dashes": gr.Checkbox(value=True, label="Clean Dashes", info="Standardize dashes"),
1108
+ "group_paragraphs": gr.Checkbox(value=False, label="Group Broken Paragraphs", info="Combine paragraphs split across pages"),
1109
+ }
1110
+
1111
+ with gr.Column():
1112
+ cleaning_options.update({
1113
+ "remove_punctuation": gr.Checkbox(value=False, label="Remove Punctuation", info="Remove all punctuation")
1114
+ })
1115
+
1116
+ # Extraction options
1117
+ with gr.TabItem("Extraction"):
1118
+ gr.Markdown("### Entity Extraction Options")
1119
+
1120
+ with gr.Row():
1121
+ with gr.Column():
1122
+ extraction_options = {
1123
+ "emails": gr.Checkbox(value=True, label="Extract Emails", info="Extract email addresses"),
1124
+ "urls": gr.Checkbox(value=True, label="Extract URLs", info="Extract URLs"),
1125
+ "phone_numbers": gr.Checkbox(value=True, label="Extract Phone Numbers", info="Extract phone numbers"),
1126
+ "ip_addresses": gr.Checkbox(value=False, label="Extract IP Addresses", info="Extract IP addresses"),
1127
+ "ner": gr.Checkbox(value=True, label="Named Entity Recognition", info="Extract named entities (people, orgs, locations)")
1128
+ }
1129
+
1130
+ # Chunking options
1131
+ with gr.TabItem("Chunking"):
1132
+ gr.Markdown("### Text Chunking Options")
1133
+
1134
+ with gr.Row():
1135
+ with gr.Column():
1136
+ chunking_method = gr.Radio(
1137
+ choices=["by_title", "by_token"],
1138
+ value="by_title",
1139
+ label="Chunking Method",
1140
+ info="How to divide the document into chunks"
1141
+ )
1142
+
1143
+ with gr.Column():
1144
+ chunking_options = {
1145
+ "max_characters": gr.Number(value=2000, label="Max Characters (by_token)", info="Maximum characters per chunk"),
1146
+ "combine_text_under_n_chars": gr.Number(value=300, label="Combine Small Text (by_title)", info="Combine sections smaller than this")
1147
+ }
1148
+
1149
+ # Embedding options
1150
+ with gr.TabItem("Embedding"):
1151
+ gr.Markdown("### Embedding Generation Options")
1152
+
1153
+ with gr.Row():
1154
+ embedding_model = gr.Dropdown(
1155
+ choices=[
1156
+ "all-MiniLM-L6-v2",
1157
+ "paraphrase-multilingual-MiniLM-L12-v2",
1158
+ "all-mpnet-base-v2",
1159
+ "sentence-t5-base",
1160
+ "" # Empty option to skip embedding
1161
+ ],
1162
+ value="all-MiniLM-L6-v2",
1163
+ label="Embedding Model",
1164
+ info="Select a model for generating embeddings (or empty to skip)"
1165
+ )
1166
+
1167
+ # Output format options
1168
+ with gr.TabItem("Output Format"):
1169
+ gr.Markdown("### Output Format Options")
1170
+
1171
+ with gr.Row():
1172
+ output_format = gr.Radio(
1173
+ choices=["json", "qa", "transformers", "label_studio"],
1174
+ value="json",
1175
+ label="Output Format",
1176
+ info="Format for the final processed output"
1177
+ )
1178
+
1179
+ # Process button
1180
+ process_btn = gr.Button("Process Documents", variant="primary")
1181
+
1182
+ # Results section
1183
+ with gr.Tabs():
1184
+ with gr.TabItem("Element Analysis"):
1185
+ with gr.Row():
1186
+ element_stats_json = gr.JSON(label="Element Statistics")
1187
+ element_dist_chart = gr.Plot(label="Element Distribution")
1188
+
1189
+ with gr.TabItem("Semantic Analysis"):
1190
+ with gr.Row():
1191
+ keyword_viz_plot = gr.Plot(label="Keyword Analysis")
1192
+
1193
+ with gr.Row():
1194
+ embedding_viz_plot = gr.Plot(label="Embedding Visualization")
1195
+ similarity_matrix_plot = gr.Plot(label="Semantic Similarity Matrix")
1196
+
1197
+ with gr.TabItem("Processed Output"):
1198
+ output_data_json = gr.JSON(label="Processed Data")
1199
+
1200
+ # Set up event handlers
1201
+ process_btn.click(
1202
+ fn=process_files,
1203
+ inputs=[
1204
+ files,
1205
+ gr.Group(list(partition_options.values())),
1206
+ gr.Group(list(cleaning_options.values())),
1207
+ gr.Group(list(extraction_options.values())),
1208
+ chunking_method,
1209
+ gr.Group(list(chunking_options.values())),
1210
+ embedding_model,
1211
+ output_format
1212
+ ],
1213
+ outputs=[
1214
+ status,
1215
+ log,
1216
+ element_stats_json,
1217
+ element_dist_chart,
1218
+ similarity_matrix_plot,
1219
+ embedding_viz_plot,
1220
+ keyword_viz_plot,
1221
+ output_data_json
1222
+ ]
1223
+ )
1224
+
1225
+ # Examples
1226
+ gr.Examples(
1227
+ examples=[
1228
+ [
1229
+ # Example with default settings - user would upload their own files
1230
+ None
1231
+ ]
1232
+ ],
1233
+ inputs=[files],
1234
+ )
1235
+
1236
+ # Add markdown with instructions
1237
+ with gr.Accordion("Instructions", open=False):
1238
+ gr.Markdown("""
1239
+ ## How to Use This App
1240
+
1241
+ 1. **Upload Documents**: Start by uploading one or more documents in the supported formats.
1242
+
1243
+ 2. **Configure Processing Options**:
1244
+ - **Partitioning**: Control how documents are broken into elements
1245
+ - **Cleaning**: Select text cleaning operations to apply
1246
+ - **Extraction**: Choose entities to extract from the text
1247
+ - **Chunking**: Set how elements are grouped into chunks
1248
+ - **Embedding**: Select a model for generating vector embeddings
1249
+ - **Output Format**: Choose the format of the final processed data
1250
+
1251
+ 3. **Process Documents**: Click the "Process Documents" button to start the pipeline
1252
+
1253
+ 4. **Analyze Results**:
1254
+ - **Element Analysis**: View statistics and distribution of document elements
1255
+ - **Semantic Analysis**: Explore keyword distribution and semantic relationships
1256
+ - **Processed Output**: View the final structured data ready for use with LLMs
1257
+
1258
+ ## Typical Use Cases
1259
+
1260
+ - **Content Extraction**: Extract structured content from unstructured documents
1261
+ - **Document Understanding**: Analyze and categorize document components
1262
+ - **Text Preprocessing**: Prepare text for further NLP or machine learning
1263
+ - **Knowledge Base Creation**: Convert documents into semantic chunks for retrieval
1264
+ - **LLM Integration**: Structure documents for use with large language models
1265
+ """)
1266
+
1267
+ return app
1268
+
1269
+ # Cell 9: Launch the application
1270
+ # Create and launch the app
1271
+ app = build_gradio_interface()
1272
+ app.launch(debug=True)
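+ # Note: debug=True is handy for local runs; on a hosted Space the defaults are usually fine,
+ # and a containerized deployment would typically pass server_name="0.0.0.0" (and a port) instead.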