Merlintxu committed (verified) · Commit a3047a6 · 1 parent: 8e2d3da

Update seo_analyzer.py

Files changed (1):
  1. seo_analyzer.py +216 -10
seo_analyzer.py CHANGED
@@ -19,8 +19,6 @@ from urllib3.util.retry import Retry
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer, util
 import torch
-import subprocess
-import sys
 import spacy
 import matplotlib.pyplot as plt
 
@@ -115,6 +113,136 @@ class SEOSpaceAnalyzer:
             logger.error(f"Error en análisis: {e}")
             return {"error": str(e)}, [], {}, {}, [], {}, {}
 
+    def _process_url(self, url: str) -> Dict:
+        try:
+            response = self.session.get(url, timeout=15)
+            response.raise_for_status()
+            content_type = response.headers.get('Content-Type', '')
+            result: Dict[str, Any] = {'url': url, 'status': 'success'}
+            if 'application/pdf' in content_type:
+                result.update(self._process_pdf(response.content))
+            elif 'text/html' in content_type:
+                result.update(self._process_html(response.text, url))
+            else:
+                result.update({'type': 'unknown', 'content': '', 'word_count': 0})
+            self._save_content(url, response.content)
+            return result
+        except requests.exceptions.Timeout as e:
+            return {'url': url, 'status': 'error', 'error': "Timeout"}
+        except requests.exceptions.HTTPError as e:
+            return {'url': url, 'status': 'error', 'error': "HTTP Error"}
+        except Exception as e:
+            return {'url': url, 'status': 'error', 'error': str(e)}
+
+    def _process_html(self, html: str, base_url: str) -> Dict:
+        soup = BeautifulSoup(html, 'html.parser')
+        clean_text = self._clean_text(soup.get_text())
+        return {
+            'type': 'html',
+            'content': clean_text,
+            'word_count': len(clean_text.split()),
+            'metadata': self._extract_metadata(soup),
+            'links': self._extract_links(soup, base_url)
+        }
+
+    def _process_pdf(self, content: bytes) -> Dict:
+        try:
+            text = ""
+            with BytesIO(content) as pdf_file:
+                reader = PyPDF2.PdfReader(pdf_file)
+                for page in reader.pages:
+                    extracted = page.extract_text()
+                    text += extracted if extracted else ""
+                clean_text = self._clean_text(text)
+                return {
+                    'type': 'pdf',
+                    'content': clean_text,
+                    'word_count': len(clean_text.split()),
+                    'page_count': len(reader.pages)
+                }
+        except Exception as e:
+            return {'type': 'pdf', 'error': str(e)}
+
+    def _clean_text(self, text: str) -> str:
+        if not text:
+            return ""
+        text = re.sub(r'\s+', ' ', text)
+        return re.sub(r'[^\w\sáéíóúñÁÉÍÓÚÑ]', ' ', text).strip()
+
+    def _extract_metadata(self, soup: BeautifulSoup) -> Dict:
+        metadata = {'title': '', 'description': '', 'keywords': [], 'og': {}}
+        if soup.title and soup.title.string:
+            metadata['title'] = soup.title.string.strip()[:200]
+        for meta in soup.find_all('meta'):
+            name = meta.get('name', '').lower()
+            prop = meta.get('property', '').lower()
+            content = meta.get('content', '')
+            if name == 'description':
+                metadata['description'] = content[:300]
+            elif name == 'keywords':
+                metadata['keywords'] = [kw.strip() for kw in content.split(',') if kw.strip()]
+            elif prop.startswith('og:'):
+                metadata['og'][prop[3:]] = content
+        return metadata
+
+    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
+        links: List[Dict] = []
+        base_netloc = urlparse(base_url).netloc
+        for tag in soup.find_all('a', href=True):
+            try:
+                href = tag['href'].strip()
+                if not href or href.startswith('javascript:'):
+                    continue
+                full_url = urljoin(base_url, href)
+                parsed = urlparse(full_url)
+                links.append({
+                    'url': full_url,
+                    'type': 'internal' if parsed.netloc == base_netloc else 'external',
+                    'anchor': self._clean_text(tag.get_text())[:100],
+                    'file_type': self._get_file_type(parsed.path)
+                })
+            except:
+                continue
+        return links
+
+    def _get_file_type(self, path: str) -> str:
+        ext = Path(path).suffix.lower()
+        return ext[1:] if ext else 'html'
+
+    def _parse_sitemap(self, sitemap_url: str) -> List[str]:
+        try:
+            response = self.session.get(sitemap_url, timeout=10)
+            response.raise_for_status()
+            if 'xml' not in response.headers.get('Content-Type', ''):
+                return []
+            soup = BeautifulSoup(response.text, 'lxml-xml')
+            urls: List[str] = []
+            if soup.find('sitemapindex'):
+                for sitemap in soup.find_all('loc'):
+                    url = sitemap.text.strip()
+                    if url.endswith('.xml'):
+                        urls.extend(self._parse_sitemap(url))
+            else:
+                urls = [loc.text.strip() for loc in soup.find_all('loc')]
+            return list({url for url in urls if url.startswith('http')})
+        except:
+            return []
+
+    def _save_content(self, url: str, content: bytes) -> None:
+        try:
+            parsed = urlparse(url)
+            domain_dir = self.base_dir / parsed.netloc
+            raw_path = parsed.path.lstrip('/')
+            if not raw_path or raw_path.endswith('/'):
+                raw_path = os.path.join(raw_path, 'index.html') if raw_path else 'index.html'
+            safe_path = sanitize_filename(raw_path)
+            save_path = domain_dir / safe_path
+            save_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(save_path, 'wb') as f:
+                f.write(content)
+        except:
+            pass
+
     def _apply_nlp(self, results: List[Dict]) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
         summaries = {}
         entities = {}
@@ -126,13 +254,13 @@ class SEOSpaceAnalyzer:
             try:
                 summary = self.models['summarizer'](content[:1024], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                 summaries[r['url']] = summary
-            except Exception as e:
-                logger.warning(f"Resumen fallido para {r['url']}: {e}")
+            except:
+                pass
             try:
                 ents = self.models['ner'](content[:1000])
                 entities[r['url']] = list(set([e['word'] for e in ents if e['entity_group'] in ['PER', 'ORG', 'LOC']]))
-            except Exception as e:
-                logger.warning(f"NER fallido para {r['url']}: {e}")
+            except:
+                pass
         return summaries, entities
 
     def _compute_semantic_similarity(self, results: List[Dict]) -> Dict[str, List[Dict]]:
@@ -153,9 +281,87 @@ class SEOSpaceAnalyzer:
                 ][:3]
                 similarity_dict[url] = top_similar
             return similarity_dict
-        except Exception as e:
-            logger.error(f"Error en similitud semántica: {e}")
+        except:
             return {}
 
-    # Aquí continuarías con los métodos restantes como _process_url, _process_html, _save_content, etc.
-    # Inclúyelos como en el original para que el archivo esté completamente funcional y documentado.
+    def _calculate_stats(self, results: List[Dict]) -> Dict:
+        successful = [r for r in results if r.get('status') == 'success']
+        content_types = [r.get('type', 'unknown') for r in successful]
+        avg_word_count = round(np.mean([r.get('word_count', 0) for r in successful]) if successful else 0, 1)
+        return {
+            'total_urls': len(results),
+            'successful': len(successful),
+            'failed': len(results) - len(successful),
+            'content_types': pd.Series(content_types).value_counts().to_dict(),
+            'avg_word_count': avg_word_count,
+            'failed_urls': [r['url'] for r in results if r.get('status') != 'success']
+        }
+
+    def _analyze_content(self, results: List[Dict]) -> Dict:
+        successful = [r for r in results if r.get('status') == 'success' and r.get('content')]
+        texts = [r['content'] for r in successful if len(r['content'].split()) > 10]
+        if not texts:
+            return {'top_keywords': [], 'content_samples': []}
+        try:
+            stop_words = list(self.models['spacy'].Defaults.stop_words)
+            vectorizer = TfidfVectorizer(stop_words=stop_words, max_features=50, ngram_range=(1, 2))
+            tfidf = vectorizer.fit_transform(texts)
+            feature_names = vectorizer.get_feature_names_out()
+            sorted_indices = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[-10:]
+            top_keywords = feature_names[sorted_indices][::-1].tolist()
+        except:
+            top_keywords = []
+        samples = [{'url': r['url'], 'sample': r['content'][:500] + '...' if len(r['content']) > 500 else r['content']} for r in successful[:3]]
+        return {'top_keywords': top_keywords, 'content_samples': samples}
+
+    def _analyze_links(self, results: List[Dict]) -> Dict:
+        all_links = []
+        for result in results:
+            if result.get('links'):
+                all_links.extend(result['links'])
+        if not all_links:
+            return {'internal_links': {}, 'external_domains': {}, 'common_anchors': {}, 'file_types': {}}
+        df = pd.DataFrame(all_links)
+        return {
+            'internal_links': df[df['type'] == 'internal']['url'].value_counts().head(20).to_dict(),
+            'external_domains': df[df['type'] == 'external']['url'].apply(lambda x: urlparse(x).netloc).value_counts().head(10).to_dict(),
+            'common_anchors': df['anchor'].value_counts().head(10).to_dict(),
+            'file_types': df['file_type'].value_counts().to_dict()
+        }
+
+    def _generate_seo_recommendations(self, results: List[Dict]) -> List[str]:
+        successful = [r for r in results if r.get('status') == 'success']
+        if not successful:
+            return ["No se pudo analizar ningún contenido exitosamente"]
+        recs = []
+        missing_titles = sum(1 for r in successful if not r.get('metadata', {}).get('title'))
+        if missing_titles:
+            recs.append(f"📌 Añadir títulos a {missing_titles} páginas")
+        short_descriptions = sum(1 for r in successful if not r.get('metadata', {}).get('description'))
+        if short_descriptions:
+            recs.append(f"📌 Añadir meta descripciones a {short_descriptions} páginas")
+        short_content = sum(1 for r in successful if r.get('word_count', 0) < 300)
+        if short_content:
+            recs.append(f"📝 Ampliar contenido en {short_content} páginas (menos de 300 palabras)")
+        all_links = [link for r in results for link in r.get('links', [])]
+        if all_links:
+            df_links = pd.DataFrame(all_links)
+            internal_links = df_links[df_links['type'] == 'internal']
+            if len(internal_links) > 100:
+                recs.append(f"🔗 Optimizar estructura de enlaces internos ({len(internal_links)} enlaces)")
+        return recs if recs else ["✅ No se detectaron problemas críticos de SEO"]
+
+    def plot_internal_links(self, links_data: Dict) -> Any:
+        internal_links = links_data.get('internal_links', {})
+        fig, ax = plt.subplots()
+        if not internal_links:
+            ax.text(0.5, 0.5, 'No hay enlaces internos', ha='center', va='center', transform=ax.transAxes)
+            ax.axis('off')
+        else:
+            names = list(internal_links.keys())
+            counts = list(internal_links.values())
+            ax.barh(names, counts)
+            ax.set_xlabel("Cantidad de enlaces")
+            ax.set_title("Top 20 Enlaces Internos")
+        plt.tight_layout()
+        return fig
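
For reference, the metadata and link handling added in this commit follows a standard BeautifulSoup pattern: read title/description/og:* tags, resolve each href with urljoin, and classify it by comparing netlocs. The snippet below is a minimal standalone sketch of that pattern for quick experimentation only; it is not part of seo_analyzer.py, and the HTML fragment and base URL are made-up examples.

# Standalone sketch (illustrative only): mirrors the extraction pattern of
# _extract_metadata/_extract_links using a made-up HTML snippet and base URL.
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

html = """<html><head><title>Demo</title>
<meta name="description" content="Página de ejemplo">
<meta property="og:title" content="Demo OG">
</head><body>
<a href="/contacto">Contacto</a>
<a href="https://externo.example.com/doc.pdf">PDF externo</a>
</body></html>"""

base_url = "https://ejemplo.example.com/"  # hypothetical crawl target
base_netloc = urlparse(base_url).netloc
soup = BeautifulSoup(html, 'html.parser')

# Metadata: the same fields the commit collects (title, description, og:*)
metadata = {'title': soup.title.string.strip() if soup.title and soup.title.string else '', 'og': {}}
for meta in soup.find_all('meta'):
    name = meta.get('name', '').lower()
    prop = meta.get('property', '').lower()
    if name == 'description':
        metadata['description'] = meta.get('content', '')[:300]
    elif prop.startswith('og:'):
        metadata['og'][prop[3:]] = meta.get('content', '')

# Links: resolve each href against the base URL, then classify by netloc,
# the same internal/external rule used in _extract_links
links = []
for tag in soup.find_all('a', href=True):
    full_url = urljoin(base_url, tag['href'].strip())
    links.append({
        'url': full_url,
        'type': 'internal' if urlparse(full_url).netloc == base_netloc else 'external',
        'anchor': tag.get_text(strip=True)[:100],
    })

print(metadata)
print(links)

In this sketch the relative /contacto link resolves against the base URL and is counted as internal, while the PDF on another host is classified as external, which is the same split that _analyze_links later aggregates per domain.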