Merlintxu committed
Commit dcf8a98 · verified · 1 Parent(s): aeda28e

Create app.py

Files changed (1)
  1. app.py +251 -0
app.py ADDED
@@ -0,0 +1,251 @@
import os
import json
import logging
import re
import requests
import hashlib
from collections import defaultdict
from urllib.parse import urlparse, urljoin
from typing import List, Dict, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

import torch
from bs4 import BeautifulSoup
import PyPDF2
from io import BytesIO
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import spacy
import gradio as gr

# Advanced configuration
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class AdvancedSEOAnalyzer:
    def __init__(self, sitemap_url: str):
        self.sitemap_url = sitemap_url
        self.session = self._configure_session()
        self.models = self._load_models()
        self.processed_urls = set()
        self.link_graph = defaultdict(list)
        self.content_store = {}
        self.documents = []

    def _configure_session(self) -> requests.Session:
        session = requests.Session()
        retry = Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('https://', adapter)
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (compatible; SEOBot/1.0; +https://seo.example.com/bot)'
        })
        return session

    def _load_models(self) -> Dict:
        # Note: en_core_web_lg must be installed separately
        # (python -m spacy download en_core_web_lg).
        return {
            'summarization': pipeline("summarization",
                                      model="facebook/bart-large-cnn",
                                      device=0 if torch.cuda.is_available() else -1),
            'qa': pipeline("question-answering",
                           model="deepset/roberta-base-squad2",
                           tokenizer="deepset/roberta-base-squad2"),
            'ner': pipeline("ner",
                            model="dslim/bert-base-NER",
                            aggregation_strategy="simple"),
            'semantic': SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'),
            'spacy': spacy.load("en_core_web_lg")
        }

    def download_content(self, url: str) -> Optional[Dict]:
        # Synchronous on purpose: run_analysis fans downloads out through a
        # ThreadPoolExecutor and requests is blocking anyway, so the original
        # async def could never be awaited there.
        if url in self.processed_urls:
            return None
        self.processed_urls.add(url)  # mark as visited so duplicate sitemap entries are skipped

        try:
            response = self.session.get(url, timeout=15)
            response.raise_for_status()
            content_type = response.headers.get('Content-Type', '')

            if 'application/pdf' in content_type:
                return self._process_pdf(url, response.content)
            elif 'text/html' in content_type:
                return self._process_html(url, response.text)
            else:
                logger.warning(f"Unsupported content type: {content_type}")
                return None

        except Exception as e:
            logger.error(f"Error downloading {url}: {str(e)}")
            return None

    def _process_pdf(self, url: str, content: bytes) -> Dict:
        text = ""
        with BytesIO(content) as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # extract_text() can return None for image-only pages
                text += page.extract_text() or ""

        doc_hash = hashlib.sha256(content).hexdigest()
        self._save_document(url, content, 'pdf')

        return {
            'url': url,
            'type': 'pdf',
            'content': text,
            'hash': doc_hash,
            'links': []
        }

    def _process_html(self, url: str, html: str) -> Dict:
        soup = BeautifulSoup(html, 'lxml')
        main_content = self._extract_main_content(soup)
        links = self._extract_links(url, soup)

        self._save_document(url, html.encode('utf-8'), 'html')

        return {
            'url': url,
            'type': 'html',
            'content': main_content,
            'hash': hashlib.sha256(html.encode()).hexdigest(),
            'links': links,
            'metadata': self._extract_metadata(soup)
        }

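    # Assumed helper: _extract_main_content is called in _process_html but never
    # defined in the original app.py. A minimal sketch that drops boilerplate
    # tags and returns the visible body text.
    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()
        body = soup.body or soup
        return ' '.join(body.get_text(separator=' ').split())
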
    def _extract_links(self, base_url: str, soup: BeautifulSoup) -> List[Dict]:
        links = []
        base_domain = urlparse(base_url).netloc

        for tag in soup.find_all(['a', 'link'], href=True):
            href = tag['href']
            full_url = urljoin(base_url, href)
            parsed = urlparse(full_url)

            link_type = 'internal' if parsed.netloc == base_domain else 'external'
            file_type = 'other'

            if parsed.path.lower().endswith(('.pdf', '.doc', '.docx')):
                file_type = 'document'
            elif parsed.path.lower().endswith(('.jpg', '.png', '.gif')):
                file_type = 'image'

            links.append({
                'url': full_url,
                'type': link_type,
                'file_type': file_type,
                'anchor': tag.text.strip()
            })

        return links

    def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
        metadata = {
            'title': soup.title.string.strip() if soup.title and soup.title.string else '',
            'description': '',
            'keywords': [],
            'open_graph': {}
        }

        meta_tags = soup.find_all('meta')
        for tag in meta_tags:
            name = tag.get('name', '').lower()
            prop = tag.get('property', '').lower()  # renamed: 'property' shadows the builtin
            content = tag.get('content', '')

            if name == 'description':
                metadata['description'] = content
            elif name == 'keywords':
                metadata['keywords'] = [kw.strip() for kw in content.split(',')]
            elif prop.startswith('og:'):
                key = prop[3:]
                metadata['open_graph'][key] = content

        return metadata

    def analyze_content(self, content: Dict) -> Dict:
        analysis = {
            # truncation=True keeps long pages within the summarizer's input limit
            'summary': self.models['summarization'](content['content'],
                                                    max_length=150,
                                                    min_length=30,
                                                    truncation=True)[0]['summary_text'],
            'entities': self.models['ner'](content['content']),
            # converted to a plain list so the result stays JSON-serializable
            'semantic_embedding': self.models['semantic'].encode(content['content']).tolist(),
            'seo_analysis': self._perform_seo_analysis(content)
        }

        if content['type'] == 'pdf':
            analysis['document_analysis'] = self._analyze_document_structure(content)

        return analysis

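    # Assumed helper: _analyze_document_structure is referenced for PDFs in
    # analyze_content but not defined in the original app.py. A minimal sketch
    # reporting rough size statistics of the extracted text.
    def _analyze_document_structure(self, content: Dict) -> Dict:
        text = content['content']
        paragraphs = [p for p in text.split('\n') if p.strip()]
        return {
            'paragraph_count': len(paragraphs),
            'word_count': len(text.split()),
            'character_count': len(text)
        }
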
    def _perform_seo_analysis(self, content: Dict) -> Dict:
        text = content['content']
        doc = self.models['spacy'](text)

        return {
            'readability_score': self._calculate_readability(text),
            'keyword_density': self._calculate_keyword_density(text),
            'heading_structure': self._analyze_headings(doc),
            'content_length': len(text.split()),
            'semantic_topics': self._extract_semantic_topics(text)
        }

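    # Assumed helpers: _calculate_readability, _calculate_keyword_density and
    # _analyze_headings are called by _perform_seo_analysis but missing from the
    # original app.py. These are minimal sketches, not the author's implementations.
    def _calculate_readability(self, text: str) -> float:
        # Rough Flesch-style proxy using average sentence and word length.
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        words = text.split()
        if not sentences or not words:
            return 0.0
        avg_sentence_len = len(words) / len(sentences)
        avg_word_len = sum(len(w) for w in words) / len(words)
        return round(max(0.0, 206.835 - 1.015 * avg_sentence_len - 84.6 * (avg_word_len / 5)), 2)

    def _calculate_keyword_density(self, text: str, top_n: int = 10) -> Dict[str, float]:
        words = [w.lower() for w in re.findall(r'[a-zA-Z]{3,}', text)]
        if not words:
            return {}
        freq = defaultdict(int)
        for word in words:
            freq[word] += 1
        top = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:top_n]
        return {word: round(count / len(words), 4) for word, count in top}

    def _analyze_headings(self, doc) -> Dict:
        # The spaCy Doc carries no HTML heading tags, so report sentence-level
        # structure instead.
        sentences = list(doc.sents)
        return {
            'sentence_count': len(sentences),
            'avg_sentence_length': round(sum(len(s) for s in sentences) / len(sentences), 2) if sentences else 0
        }
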
    def _extract_semantic_topics(self, text: str) -> List[str]:
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
        tfidf = vectorizer.fit_transform([text])
        feature_array = np.array(vectorizer.get_feature_names_out())
        tfidf_sorting = np.argsort(tfidf.toarray()).flatten()[::-1]

        return feature_array[tfidf_sorting][:5].tolist()

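    # Assumed helper: _parse_sitemap is called by run_analysis but not defined in
    # the original app.py. A minimal sketch that collects <loc> entries and
    # follows nested sitemap indexes one level deep.
    def _parse_sitemap(self) -> List[str]:
        urls: List[str] = []
        try:
            response = self.session.get(self.sitemap_url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'xml')
            for loc in soup.find_all('loc'):
                url = loc.text.strip()
                if url.endswith('.xml'):
                    nested = self.session.get(url, timeout=15)
                    nested_soup = BeautifulSoup(nested.text, 'xml')
                    urls.extend(l.text.strip() for l in nested_soup.find_all('loc'))
                else:
                    urls.append(url)
        except Exception as e:
            logger.error(f"Error parsing sitemap {self.sitemap_url}: {e}")
        return urls
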
    def run_analysis(self, max_workers: int = 4) -> Dict:
        sitemap_urls = self._parse_sitemap()
        results = []

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(self.download_content, url)
                       for url in sitemap_urls]

            for future in as_completed(futures):
                result = future.result()
                if result:
                    analyzed = self.analyze_content(result)
                    results.append({**result, **analyzed})
                    self._update_link_graph(result)

        self._save_full_analysis(results)
        return {
            'total_pages': len(results),
            'document_types': self._count_document_types(results),
            'link_analysis': self._analyze_link_graph(),
            'content_analysis': self._aggregate_content_stats(results)
        }

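    # Assumed helpers: the link-graph methods below are used by run_analysis but
    # missing from the original app.py. Minimal sketches over self.link_graph.
    def _update_link_graph(self, result: Dict) -> None:
        for link in result.get('links', []):
            self.link_graph[result['url']].append(link['url'])

    def _analyze_link_graph(self) -> Dict:
        incoming = defaultdict(int)
        for targets in self.link_graph.values():
            for target in targets:
                incoming[target] += 1
        return {
            'pages_with_outlinks': len(self.link_graph),
            'total_links': sum(len(t) for t in self.link_graph.values()),
            'most_linked_pages': sorted(incoming.items(), key=lambda kv: kv[1], reverse=True)[:10]
        }
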
    def _save_document(self, url: str, content: bytes, file_type: str) -> None:
        parsed = urlparse(url)
        path = parsed.path.lstrip('/')
        filename = f"documents/{parsed.netloc}/{path}" if path else f"documents/{parsed.netloc}/index"

        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename + f'.{file_type}', 'wb') as f:
            f.write(content)

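    # Assumed helpers: the reporting methods below are referenced by run_analysis
    # but not defined in the original app.py. Minimal sketches that persist and
    # summarize the collected results.
    def _save_full_analysis(self, results: List[Dict]) -> None:
        os.makedirs('documents', exist_ok=True)
        with open('documents/full_analysis.json', 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2, default=str)

    def _count_document_types(self, results: List[Dict]) -> Dict[str, int]:
        counts = defaultdict(int)
        for result in results:
            counts[result.get('type', 'unknown')] += 1
        return dict(counts)

    def _aggregate_content_stats(self, results: List[Dict]) -> Dict:
        lengths = [r['seo_analysis']['content_length'] for r in results if 'seo_analysis' in r]
        return {
            'avg_content_length': round(sum(lengths) / len(lengths), 1) if lengths else 0,
            'min_content_length': min(lengths) if lengths else 0,
            'max_content_length': max(lengths) if lengths else 0
        }
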
    def launch_interface(self):
        def analyze(sitemap_url: str):
            # Adapt run_analysis to the declared Gradio inputs/outputs: take the
            # URL typed in the textbox, run the crawl, and write a downloadable
            # JSON report alongside the on-screen results.
            self.sitemap_url = sitemap_url
            results = self.run_analysis()
            report_path = "analysis_results.json"
            with open(report_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2, default=str)
            return results, report_path

        interface = gr.Interface(
            fn=analyze,
            inputs=gr.Textbox(label="Sitemap URL"),
            outputs=[
                gr.JSON(label="Analysis Results"),
                gr.File(label="Download Data")
            ],
            title="Advanced SEO Analyzer",
            description="Analyze websites with AI-powered SEO insights"
        )
        interface.launch()

if __name__ == "__main__":
    analyzer = AdvancedSEOAnalyzer("https://www.example.com/sitemap.xml")
    analyzer.launch_interface()