fikird committed
Commit a3440c5 · 1 Parent(s): 48922fa

feat: Enhanced search engine with caching and metadata

- Added result caching with TTL
- Improved content extraction
- Enhanced metadata collection
- Optimized dependencies
- Removed unnecessary files
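
As a minimal sketch of what the new caching means for a caller, based on the methods this commit adds to engines/search.py (the `engines.search` import path and the demo wrapper are assumptions for illustration, not part of the commit):

import asyncio

from engines.search import SearchEngine  # assumed package layout for engines/search.py

async def demo() -> None:
    engine = SearchEngine()

    # First call: search_web() runs (throttled by search_delay), pages are
    # fetched and chunked, and the final dict is stored in engine.cache under
    # an MD5 key built by _get_cache_key(query).
    first = await engine.process_search_results("open source RAG search")

    # A repeat of the same query within cache_ttl (24 hours) is answered from
    # the in-memory cache via _get_cached_result(); no new web search happens.
    second = await engine.process_search_results("open source RAG search")

    print(sorted(first))    # ['answer', 'metadata', 'sources']
    assert first is second  # the cached dict object is returned as-is

asyncio.run(demo())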

Files changed (7)
  1. apt.txt +0 -8
  2. engines/search.py +159 -13
  3. osint_engine.py +0 -489
  4. packages.txt +0 -25
  5. requirements.txt +32 -35
  6. search_engine.py +0 -219
  7. space.yml +0 -11
apt.txt DELETED
@@ -1,8 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1
engines/search.py CHANGED
@@ -1,5 +1,5 @@
 """
-RAG-based search engine with intelligent answer synthesis.
+Advanced RAG-based search engine with multi-source intelligence.
 """
 from typing import List, Dict, Any, Optional
 import asyncio
@@ -13,6 +13,12 @@ from googlesearch import search as gsearch
 import requests
 from bs4 import BeautifulSoup
 from tenacity import retry, stop_after_attempt, wait_exponential
+import json
+import time
+from datetime import datetime, timedelta
+import hashlib
+from urllib.parse import urlparse
+import re
 
 class SearchEngine:
     def __init__(self):
@@ -23,12 +29,42 @@ class SearchEngine:
             chunk_size=500,
             chunk_overlap=50
         )
+        self.cache = {}
+        self.cache_ttl = timedelta(hours=24)
+        self.search_delay = 2  # seconds between searches
+        self.last_search_time = datetime.min
+
+    def _get_cache_key(self, query: str, **kwargs) -> str:
+        """Generate cache key from query and kwargs."""
+        cache_data = {
+            "query": query,
+            **kwargs
+        }
+        return hashlib.md5(json.dumps(cache_data, sort_keys=True).encode()).hexdigest()
+
+    def _get_cached_result(self, cache_key: str) -> Optional[Dict[str, Any]]:
+        """Get result from cache if valid."""
+        if cache_key in self.cache:
+            result, timestamp = self.cache[cache_key]
+            if datetime.now() - timestamp < self.cache_ttl:
+                return result
+            del self.cache[cache_key]
+        return None
+
+    def _set_cached_result(self, cache_key: str, result: Dict[str, Any]):
+        """Store result in cache."""
+        self.cache[cache_key] = (result, datetime.now())
 
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def search_web(self, query: str, max_results: int = 10) -> List[Dict[str, str]]:
         """Perform web search using multiple search engines."""
         results = []
 
+        # Respect rate limiting
+        time_since_last = datetime.now() - self.last_search_time
+        if time_since_last.total_seconds() < self.search_delay:
+            await asyncio.sleep(self.search_delay - time_since_last.total_seconds())
+
         # DuckDuckGo Search
         try:
             with DDGS() as ddgs:
@@ -44,8 +80,26 @@ class SearchEngine:
         except Exception as e:
             print(f"Google search error: {e}")
 
+        self.last_search_time = datetime.now()
         return results[:max_results]
 
+    def _clean_html(self, html: str) -> str:
+        """Clean HTML content."""
+        # Remove script and style elements
+        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL)
+        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL)
+
+        # Remove comments
+        html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
+
+        # Remove remaining tags
+        html = re.sub(r'<[^>]+>', ' ', html)
+
+        # Clean whitespace
+        html = re.sub(r'\s+', ' ', html).strip()
+
+        return html
+
     @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
     async def fetch_content(self, url: str) -> Optional[str]:
         """Fetch and extract content from a webpage."""
@@ -56,25 +110,90 @@ class SearchEngine:
             response = requests.get(url, headers=headers, timeout=10)
             response.raise_for_status()
 
+            # Extract main content
             soup = BeautifulSoup(response.text, "html.parser")
 
             # Remove unwanted elements
-            for element in soup(["script", "style", "nav", "footer", "header"]):
+            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
                 element.decompose()
 
-            text = soup.get_text(separator="\n", strip=True)
+            # Try to find main content
+            main_content = None
+
+            # Look for article tag
+            if soup.find("article"):
+                main_content = soup.find("article")
+
+            # Look for main tag
+            elif soup.find("main"):
+                main_content = soup.find("main")
+
+            # Look for div with common content class names
+            elif soup.find("div", class_=re.compile(r"content|article|post|entry")):
+                main_content = soup.find("div", class_=re.compile(r"content|article|post|entry"))
+
+            # Use body if no main content found
+            if not main_content:
+                main_content = soup.body
+
+            # Extract text
+            if main_content:
+                text = self._clean_html(str(main_content))
+            else:
+                text = self._clean_html(response.text)
+
             return text
         except Exception as e:
             print(f"Error fetching {url}: {e}")
             return None
 
+    def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict[str, Any]:
+        """Extract metadata from webpage."""
+        metadata = {
+            "url": url,
+            "domain": urlparse(url).netloc,
+            "title": None,
+            "description": None,
+            "published_date": None,
+            "author": None,
+            "keywords": None
+        }
+
+        # Extract title
+        if soup.title:
+            metadata["title"] = soup.title.string
+
+        # Extract meta tags
+        for meta in soup.find_all("meta"):
+            name = meta.get("name", "").lower()
+            property = meta.get("property", "").lower()
+            content = meta.get("content")
+
+            if name == "description" or property == "og:description":
+                metadata["description"] = content
+            elif name == "author":
+                metadata["author"] = content
+            elif name == "keywords":
+                metadata["keywords"] = content
+            elif name in ["published_time", "article:published_time"]:
+                metadata["published_date"] = content
+
+        return metadata
+
     async def process_search_results(self, query: str) -> Dict[str, Any]:
         """Process search results and create a RAG-based answer."""
+        cache_key = self._get_cache_key(query)
+        cached_result = self._get_cached_result(cache_key)
+        if cached_result:
+            return cached_result
+
         # Perform web search
         search_results = await self.search_web(query)
 
         # Fetch content from search results
         documents = []
+        metadata_list = []
+
         for result in search_results:
             url = result.get("link")
             if not url:
@@ -84,17 +203,28 @@ class SearchEngine:
             if content:
                 # Split content into chunks
                 chunks = self.text_splitter.split_text(content)
+
+                # Store metadata
+                metadata = {
+                    "source": url,
+                    "title": result.get("title", url),
+                    **result
+                }
+                metadata_list.append(metadata)
+
+                # Create documents
                 for chunk in chunks:
                     doc = Document(
                         page_content=chunk,
-                        metadata={"source": url, "title": result.get("title", url)}
+                        metadata=metadata
                     )
                     documents.append(doc)
 
         if not documents:
             return {
                 "answer": "I couldn't find any relevant information.",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }
 
         # Create vector store
@@ -109,18 +239,33 @@ class SearchEngine:
         # Get relevant documents
         relevant_docs = chain.retriever.get_relevant_documents(query)
 
-        # For now, return the most relevant chunks and sources
+        # Extract unique sources and content
         sources = []
         content = []
+        used_metadata = []
 
-        for doc in relevant_docs[:3]:
-            if doc.metadata["source"] not in sources:
-                sources.append(doc.metadata["source"])
-                content.append(doc.page_content)
+        for doc in relevant_docs[:5]:  # Limit to top 5 most relevant docs
+            source = doc.metadata["source"]
+            if source not in sources:
+                sources.append(source)
+                content.append(doc.page_content)
+
+                # Find corresponding metadata
+                for meta in metadata_list:
+                    if meta["source"] == source:
+                        used_metadata.append(meta)
+                        break
+
+        result = {
             "answer": "\n\n".join(content),
-            "sources": sources
+            "sources": sources,
+            "metadata": used_metadata
         }
+
+        # Cache the result
+        self._set_cached_result(cache_key, result)
+
+        return result
 
     async def search(self, query: str) -> Dict[str, Any]:
         """Main search interface."""
@@ -129,5 +274,6 @@ class SearchEngine:
         except Exception as e:
             return {
                 "answer": f"An error occurred: {str(e)}",
-                "sources": []
+                "sources": [],
+                "metadata": []
             }
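
The improved content extraction above funnels page HTML through the regex-based _clean_html() helper added in this diff. A small illustrative sketch of its behaviour follows; it calls the internal helper directly purely for demonstration, and the import path is an assumption:

from engines.search import SearchEngine  # assumed package layout

html = (
    "<html><head><style>p { color: red }</style></head>"
    "<body><script>alert('x')</script><!-- nav markup -->"
    "<article><p>First paragraph.</p><p>Second    paragraph.</p></article>"
    "</body></html>"
)

engine = SearchEngine()
print(engine._clean_html(html))
# -> "First paragraph. Second paragraph."
# Script/style blocks, HTML comments, and remaining tags are stripped with
# re.sub, then runs of whitespace collapse to single spaces.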
osint_engine.py DELETED
@@ -1,489 +0,0 @@
-import os
-import re
-import json
-import time
-import asyncio
-import aiohttp
-import requests
-import httpx
-from PIL import Image
-from io import BytesIO
-from typing import Dict, List, Any, Union, Optional
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
-from geopy.geocoders import Nominatim
-from waybackpy import WaybackMachineCDXServerAPI
-import whois
-from datetime import datetime
-from googlesearch import search as google_search
-import base64
-import io
-
-class OSINTEngine:
-    """OSINT capabilities for advanced information gathering"""
-
-    def __init__(self):
-        self.chrome_options = Options()
-        self.chrome_options.add_argument('--headless')
-        self.chrome_options.add_argument('--no-sandbox')
-        self.chrome_options.add_argument('--disable-dev-shm-usage')
-        self.setup_apis()
-        self.session = None
-        self.platforms = {
-            "twitter": "https://twitter.com/{}",
-            "instagram": "https://instagram.com/{}",
-            "facebook": "https://facebook.com/{}",
-            "linkedin": "https://linkedin.com/in/{}",
-            "github": "https://github.com/{}",
-            "reddit": "https://reddit.com/user/{}",
-            "youtube": "https://youtube.com/@{}",
-            "tiktok": "https://tiktok.com/@{}",
-            "pinterest": "https://pinterest.com/{}",
-            "snapchat": "https://snapchat.com/add/{}",
-            "twitch": "https://twitch.tv/{}",
-            "medium": "https://medium.com/@{}",
-            "devto": "https://dev.to/{}",
-            "stackoverflow": "https://stackoverflow.com/users/{}"
-        }
-
-    def setup_apis(self):
-        """Initialize API clients"""
-        self.geolocator = Nominatim(user_agent="intelligent_search")
-        self.http_client = httpx.AsyncClient()
-
-    async def initialize(self):
-        if not self.session:
-            self.session = aiohttp.ClientSession()
-
-    async def close(self):
-        if self.session:
-            await self.session.close()
-            self.session = None
-
-    async def search_username(self, username: str) -> Dict[str, Any]:
-        """Search for username across multiple platforms"""
-        results = {
-            'platforms': [],
-            'social_media': {},
-            'websites': []
-        }
-
-        # Common social media platforms
-        platforms = [
-            {'name': 'GitHub', 'url': f'https://github.com/{username}'},
-            {'name': 'Twitter', 'url': f'https://twitter.com/{username}'},
-            {'name': 'Instagram', 'url': f'https://instagram.com/{username}'},
-            {'name': 'LinkedIn', 'url': f'https://linkedin.com/in/{username}'},
-            {'name': 'Facebook', 'url': f'https://facebook.com/{username}'},
-            {'name': 'YouTube', 'url': f'https://youtube.com/@{username}'},
-        ]
-
-        async with aiohttp.ClientSession() as session:
-            tasks = []
-            for platform in platforms:
-                task = self.check_profile(session, platform['url'], platform['name'])
-                tasks.append(task)
-
-            platform_results = await asyncio.gather(*tasks)
-            results['platforms'] = [r for r in platform_results if r is not None]
-
-        # Google search for additional mentions
-        try:
-            search_query = f'"{username}" OR "@{username}" -site:twitter.com -site:facebook.com -site:instagram.com'
-            web_results = list(google_search(search_query, num_results=5))
-            results['websites'] = web_results
-        except Exception as e:
-            results['websites'] = [str(e)]
-
-        return results
-
-    async def check_profile(self, session, url: str, platform: str) -> Dict[str, str]:
-        """Check if a profile exists on a platform"""
-        try:
-            async with session.get(url) as response:
-                if response.status == 200:
-                    return {
-                        'platform': platform,
-                        'url': url,
-                        'exists': True
-                    }
-        except:
-            pass
-        return None
-
-    async def check_username(self, username: str, platform: str = "all") -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        platforms_to_check = [platform] if platform != "all" else self.platforms.keys()
-
-        for platform_name in platforms_to_check:
-            if platform_name in self.platforms:
-                url = self.platforms[platform_name].format(username)
-                try:
-                    async with self.session.get(url) as response:
-                        exists = response.status == 200
-                        results.append({
-                            "platform": platform_name,
-                            "url": url,
-                            "exists": exists
-                        })
-                except:
-                    results.append({
-                        "platform": platform_name,
-                        "url": url,
-                        "exists": False,
-                        "error": "Connection failed"
-                    })
-
-        return results
-
-    async def search_image(self, image_url: str) -> Dict[str, Any]:
-        """Image analysis and reverse search"""
-        results = {
-            'analysis': {},
-            'similar_images': [],
-            'error': None
-        }
-
-        try:
-            # Download and analyze image
-            response = requests.get(image_url)
-            img = Image.open(BytesIO(response.content))
-
-            # Basic image analysis
-            results['analysis'] = {
-                'format': img.format,
-                'size': img.size,
-                'mode': img.mode
-            }
-
-            # Perform reverse image search using Google Lens
-            search_url = f"https://lens.google.com/uploadbyurl?url={image_url}"
-            results['similar_images'].append({
-                'source': 'Google Lens',
-                'url': search_url
-            })
-
-        except Exception as e:
-            results['error'] = str(e)
-
-        return results
-
-    async def gather_personal_info(self, data: Dict[str, str]) -> Dict[str, Any]:
-        """Gather personal information from various sources"""
-        results = {}
-
-        if 'location' in data:
-            results['location'] = await self.analyze_location(data['location'])
-
-        if 'domain' in data:
-            results['domain'] = self.analyze_domain(data['domain'])
-
-        return results
-
-    async def analyze_location(self, location: str) -> Dict[str, Any]:
-        """Analyze location information"""
-        try:
-            location_data = self.geolocator.geocode(location)
-            if location_data:
-                return {
-                    'address': location_data.address,
-                    'latitude': location_data.latitude,
-                    'longitude': location_data.longitude,
-                    'raw': location_data.raw
-                }
-        except Exception as e:
-            return {'error': str(e)}
-        return None
-
-    def analyze_domain(self, domain: str) -> Dict[str, Any]:
-        """Analyze domain information"""
-        try:
-            domain_info = whois.whois(domain)
-            return {
-                'registrar': domain_info.registrar,
-                'creation_date': domain_info.creation_date,
-                'expiration_date': domain_info.expiration_date,
-                'last_updated': domain_info.updated_date,
-                'status': domain_info.status
-            }
-        except Exception as e:
-            return {'error': str(e)}
-
-    async def search_historical_data(self, url: str) -> List[Dict[str, Any]]:
-        """Search for historical data using Wayback Machine"""
-        results = []
-
-        try:
-            user_agent = "Mozilla/5.0"
-            cdx = WaybackMachineCDXServerAPI(url, user_agent)
-
-            for snapshot in cdx.snapshots():
-                results.append({
-                    'timestamp': snapshot.timestamp,
-                    'url': snapshot.archive_url,
-                    'status': snapshot.status_code,
-                    'mime_type': snapshot.mime_type
-                })
-
-        except Exception as e:
-            results.append({'error': str(e)})
-
-        return results
-
-    async def search_person(self, name: str, location: Optional[str] = None) -> List[Dict]:
-        await self.initialize()
-        results = []
-
-        # Format search query
-        query = f"{name}"
-        if location:
-            query += f" {location}"
-
-        # Simulate searching various sources
-        sources = ["social_media", "news", "public_records", "professional"]
-
-        for source in sources:
-            # Simulate different data sources
-            if source == "social_media":
-                profile = {
-                    "name": name,
-                    "location": location,
-                    "source": "Social Media",
-                    "profile_image": "https://example.com/profile.jpg",
-                    "social_links": [
-                        {"platform": "LinkedIn", "url": f"https://linkedin.com/in/{name.lower().replace(' ', '-')}"},
-                        {"platform": "Twitter", "url": f"https://twitter.com/{name.lower().replace(' ', '')}"}
-                    ],
-                    "occupation": "Professional",
-                    "last_seen": datetime.now().strftime("%Y-%m-%d")
-                }
-                results.append(profile)
-
-            elif source == "news":
-                news = {
-                    "name": name,
-                    "source": "News Articles",
-                    "mentions": [
-                        {
-                            "title": f"Article about {name}",
-                            "url": "https://example.com/news",
-                            "date": "2023-01-01"
-                        }
-                    ]
-                }
-                results.append(news)
-
-            elif source == "public_records":
-                record = {
-                    "name": name,
-                    "source": "Public Records",
-                    "location": location,
-                    "age_range": "25-35",
-                    "possible_relatives": ["Jane Doe", "John Doe Sr."],
-                    "previous_locations": ["New York, NY", "Los Angeles, CA"]
-                }
-                results.append(record)
-
-            elif source == "professional":
-                prof = {
-                    "name": name,
-                    "source": "Professional Records",
-                    "education": ["University Example"],
-                    "work_history": ["Company A", "Company B"],
-                    "skills": ["Leadership", "Management"]
-                }
-                results.append(prof)
-
-        return results
-
-    async def get_person_details(self, person_id: str) -> Dict:
-        """Get detailed information about a specific person"""
-        await self.initialize()
-
-        # Simulate gathering detailed information
-        details = {
-            "personal": {
-                "name": person_id,
-                "age_range": "25-35",
-                "locations": ["Current City, Country", "Previous City, Country"],
-                "education": ["University Name", "High School Name"],
-                "occupation": "Current Occupation"
-            },
-            "social_media": {
-                "profiles": [
-                    {
-                        "platform": "LinkedIn",
-                        "url": f"https://linkedin.com/in/{person_id}",
-                        "last_active": "2023-01-01"
-                    },
-                    {
-                        "platform": "Twitter",
-                        "url": f"https://twitter.com/{person_id}",
-                        "last_active": "2023-01-01"
-                    }
-                ]
-            },
-            "contact": {
-                "email_pattern": "j***@example.com",
-                "phone_pattern": "+1 (***) ***-**89"
-            },
-            "images": [
-                {
-                    "url": "https://example.com/profile1.jpg",
-                    "source": "LinkedIn",
-                    "date": "2023-01-01"
-                }
-            ],
-            "activities": {
-                "recent_posts": [
-                    {
-                        "platform": "Twitter",
-                        "content": "Example post content",
-                        "date": "2023-01-01"
-                    }
-                ],
-                "mentions": [
-                    {
-                        "source": "News Article",
-                        "title": "Article Title",
-                        "url": "https://example.com/article",
-                        "date": "2023-01-01"
-                    }
-                ]
-            }
-        }
-
-        return details
-
-    async def analyze_image(self, image_path: str) -> Dict:
-        """Analyze an image and return information about it"""
-        try:
-            # Open and analyze the image
-            img = Image.open(image_path if os.path.exists(image_path) else io.BytesIO(requests.get(image_path).content))
-
-            analysis = {
-                "format": img.format,
-                "size": f"{img.size[0]}x{img.size[1]}",
-                "mode": img.mode,
-                "metadata": {},
-            }
-
-            # Extract EXIF data if available
-            if hasattr(img, '_getexif') and img._getexif():
-                exif = img._getexif()
-                if exif:
-                    analysis["metadata"] = {
-                        "datetime": exif.get(306, "Unknown"),
-                        "make": exif.get(271, "Unknown"),
-                        "model": exif.get(272, "Unknown"),
-                        "software": exif.get(305, "Unknown")
-                    }
-
-            return analysis
-        except Exception as e:
-            return {"error": str(e)}
-
-    async def find_similar_images(self, image_url: str) -> List[Dict]:
-        """Find similar images"""
-        # Simulate finding similar images
-        return [
-            {
-                "url": "https://example.com/similar1.jpg",
-                "similarity": 0.95,
-                "source": "Website A"
-            },
-            {
-                "url": "https://example.com/similar2.jpg",
-                "similarity": 0.85,
-                "source": "Website B"
-            }
-        ]
-
-    async def get_location_info(self, location: str) -> Dict:
-        """Get information about a location"""
-        # Simulate location information retrieval
-        return {
-            "name": location,
-            "coordinates": {"lat": 40.7128, "lng": -74.0060},
-            "country": "United States",
-            "timezone": "America/New_York",
-            "population": "8.4 million",
-            "weather": "Sunny, 72°F"
-        }
-
-    async def get_domain_info(self, domain: str) -> Dict:
-        """Get information about a domain"""
-        # Simulate domain information retrieval
-        return {
-            "domain": domain,
-            "registrar": "Example Registrar",
-            "creation_date": "2020-01-01",
-            "expiration_date": "2024-01-01",
-            "nameservers": ["ns1.example.com", "ns2.example.com"],
-            "ip_address": "192.0.2.1",
-            "location": "United States"
-        }
-
-# Helper function to create document from gathered information
-def create_report(data: Dict[str, Any], template: str = "default") -> str:
-    """Create a formatted report from gathered information"""
-    if template == "default":
-        report = "# OSINT Investigation Report\n\n"
-        report += f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
-
-        for section, content in data.items():
-            report += f"## {section.title()}\n"
-            if isinstance(content, dict):
-                for key, value in content.items():
-                    report += f"* {key}: {value}\n"
-            elif isinstance(content, list):
-                for item in content:
-                    if isinstance(item, dict):
-                        for k, v in item.items():
-                            report += f"* {k}: {v}\n"
-                    else:
-                        report += f"* {item}\n"
-            else:
-                report += f"{content}\n"
-            report += "\n"
-
-        return report
-    else:
-        raise ValueError(f"Template '{template}' not found")
-
-async def create_report_from_data(data: Dict) -> Dict:
-    """Create a formatted report from the gathered data"""
-    engine = OSINTEngine()
-
-    try:
-        report = {}
-
-        if "username" in data:
-            report["platforms"] = await engine.check_username(data["username"], data.get("platform", "all"))
-
-        if "image_url" in data:
-            report["analysis"] = await engine.analyze_image(data["image_url"])
-            report["similar_images"] = await engine.find_similar_images(data["image_url"])
-
-        if "location" in data:
-            report["location"] = await engine.get_location_info(data["location"])
-
-        if "domain" in data:
-            report["domain"] = await engine.get_domain_info(data["domain"])
-
-        if "name" in data:
-            report["matches"] = await engine.search_person(data["name"], data.get("location"))
-
-        if "person_id" in data:
-            report["details"] = await engine.get_person_details(data["person_id"])
-
-        await engine.close()
-        return report
-
-    except Exception as e:
-        await engine.close()
-        return {"error": str(e)}
packages.txt DELETED
@@ -1,25 +0,0 @@
-python3-dev
-python3-pip
-build-essential
-gcc
-g++
-git
-cmake
-libgomp1
-libglib2.0-0
-libnss3
-libnspr4
-libatk1.0-0
-libatk-bridge2.0-0
-libcups2
-libdrm2
-libdbus-1-3
-libxkbcommon0
-libxcomposite1
-libxdamage1
-libxfixes3
-libxrandr2
-libgbm1
-libpango-1.0-0
-libcairo2
-libasound2
requirements.txt CHANGED
@@ -1,42 +1,39 @@
-# Core dependencies
-langchain==0.0.335
-pydantic==1.10.13
-numpy>=1.23.5
-pandas>=2.0.2
-tqdm>=4.65.0
-
-# Web and Networking
-requests==2.31.0
-aiohttp==3.8.5
-httpx==0.24.1
-beautifulsoup4==4.12.2
-selenium==4.15.2
-webdriver-manager==4.0.1
-googlesearch-python==1.2.3
-duckduckgo-search==3.8.5
+# Core Dependencies
+python-dotenv>=1.0.0
+langchain>=0.0.200
+transformers>=4.30.2
+sentence-transformers>=2.2.2
+faiss-cpu>=1.7.4
+torch>=2.0.1 --index-url https://download.pytorch.org/whl/cpu
+accelerate>=0.21.0
 
-# ML and AI
---extra-index-url https://download.pytorch.org/whl/cpu
-torch==2.0.1+cpu
-torchvision==0.15.2+cpu
-transformers==4.31.0
-sentence-transformers==2.2.2
+# Web Scraping & Search
+duckduckgo-search>=3.8.3
+beautifulsoup4>=4.12.2
+requests>=2.31.0
+google>=3.0.0
+tenacity>=8.2.2
+aiohttp>=3.8.5
+httpx>=0.24.1
 
-# UI
-gradio==3.40.1
+# Image Processing
+Pillow>=10.0.0
+face-recognition>=1.3.0
+opencv-python-headless>=4.8.0
 
 # OSINT Tools
-python-whois==0.8.0
-geopy==2.4.1
-socid-extractor==1.0.0
-holehe==1.61
-sherlock-project==0.14.3
+holehe>=1.61
+sherlock-project>=0.14.0
+python-whois>=0.8.0
+geopy>=2.3.0
 
-# Image Processing
-Pillow==10.0.0
-face-recognition==1.3.0
+# UI
+gradio>=3.40.1
+markdown>=3.4.3
 
 # Utilities
-python-dotenv==1.0.0
-tenacity==8.2.3
-retry==0.9.2
+python-dateutil>=2.8.2
+tqdm>=4.65.0
+validators>=0.20.0
+urllib3>=2.0.4
+certifi>=2023.7.22
search_engine.py DELETED
@@ -1,219 +0,0 @@
-from typing import Dict, List, Any
-import requests
-from bs4 import BeautifulSoup
-from duckduckgo_search import ddg
-from transformers import pipeline
-from langchain.embeddings import HuggingFaceEmbeddings
-import time
-import json
-import os
-from urllib.parse import urlparse
-import asyncio
-
-class ModelManager:
-    """Manages AI models for text processing"""
-    def __init__(self):
-        # Initialize with smaller, CPU-friendly models
-        self.summarizer = pipeline(
-            "summarization",
-            model="facebook/bart-base",
-            device=-1  # Use CPU
-        )
-        self.embeddings = HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2"
-        )
-
-    def generate_summary(self, text: str, max_length: int = 150) -> str:
-        """Generate a concise summary of the text"""
-        if not text or len(text.split()) < 50:
-            return text
-
-        try:
-            summary = self.summarizer(
-                text,
-                max_length=max_length,
-                min_length=30,
-                do_sample=False
-            )[0]['summary_text']
-            return summary
-        except Exception as e:
-            print(f"Error in summarization: {e}")
-            return text[:500] + "..."
-
-class ContentProcessor:
-    """Processes and analyzes different types of content"""
-    def __init__(self):
-        self.model_manager = ModelManager()
-
-    def process_content(self, content: str) -> Dict[str, Any]:
-        """Process content and generate insights"""
-        if not content:
-            return {"summary": "", "insights": []}
-
-        try:
-            summary = self.model_manager.generate_summary(content)
-            return {
-                "summary": summary,
-                "insights": []  # Simplified for CPU deployment
-            }
-        except Exception as e:
-            print(f"Error processing content: {e}")
-            return {"summary": content[:500] + "...", "insights": []}
-
-class OSINTEngine:
-    """Main OSINT engine class"""
-    def __init__(self):
-        from osint_engine import OSINTEngine as ExternalOSINT
-        self.engine = ExternalOSINT()
-
-    async def search_username(self, query: str) -> Dict[str, Any]:
-        """Search for usernames"""
-        return await self.engine.search_username(query)
-
-    async def search_image(self, query: str) -> Dict[str, Any]:
-        """Search for images"""
-        return await self.engine.search_image(query)
-
-    async def search_social_media(self, query: str, platform: str) -> Dict[str, Any]:
-        """Search for social media profiles"""
-        results = await self.engine.search_username(query)
-        if platform:
-            return {platform: [r for r in results.get('platforms', []) if r['platform'].lower() == platform.lower()]}
-        return results
-
-    async def gather_personal_info(self, kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Gather personal information"""
-        return await self.engine.gather_personal_info(kwargs)
-
-    async def search_historical_data(self, query: str) -> Dict[str, Any]:
-        """Search for historical data"""
-        return await self.engine.search_historical_data(query)
-
-class WebSearchEngine:
-    """Main search engine class"""
-    def __init__(self):
-        self.processor = ContentProcessor()
-        self.session = requests.Session()
-        self.request_delay = 1.0
-        self.last_request_time = 0
-        self.osint_engine = OSINTEngine()  # Add OSINT engine
-
-    def is_valid_url(self, url: str) -> bool:
-        """Check if URL is valid for crawling"""
-        try:
-            parsed = urlparse(url)
-            return bool(parsed.netloc and parsed.scheme in ['http', 'https'])
-        except:
-            return False
-
-    def get_metadata(self, soup: BeautifulSoup) -> Dict[str, str]:
-        """Extract metadata from page"""
-        metadata = {}
-
-        # Get title
-        title = soup.find('title')
-        if title:
-            metadata['title'] = title.text.strip()
-
-        # Get meta description
-        desc = soup.find('meta', attrs={'name': 'description'})
-        if desc:
-            metadata['description'] = desc.get('content', '')
-
-        # Get publication date
-        date = soup.find('meta', attrs={'property': 'article:published_time'})
-        if date:
-            metadata['published_date'] = date.get('content', '').split('T')[0]
-
-        return metadata
-
-    def process_url(self, url: str) -> Dict[str, Any]:
-        """Process a single URL"""
-        if not self.is_valid_url(url):
-            return None
-
-        try:
-            # Rate limiting
-            current_time = time.time()
-            if current_time - self.last_request_time < self.request_delay:
-                time.sleep(self.request_delay)
-
-            response = self.session.get(url, timeout=10)
-            self.last_request_time = time.time()
-
-            if response.status_code != 200:
-                return None
-
-            soup = BeautifulSoup(response.text, 'lxml')
-            metadata = self.get_metadata(soup)
-
-            # Extract main content (simplified)
-            content = ' '.join([p.text for p in soup.find_all('p')])
-            processed = self.processor.process_content(content)
-
-            return {
-                'url': url,
-                'title': metadata.get('title', url),
-                'summary': processed['summary'],
-                'published_date': metadata.get('published_date', '')
-            }
-
-        except Exception as e:
-            print(f"Error processing URL {url}: {e}")
-            return None
-
-    def search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-        """Perform search and process results"""
-        try:
-            # Perform DuckDuckGo search
-            search_results = ddg(query, max_results=max_results)
-
-            results = []
-            for result in search_results:
-                processed = self.process_url(result['link'])
-                if processed:
-                    results.append(processed)
-
-            return results[:max_results]
-
-        except Exception as e:
-            print(f"Error in search: {e}")
-            return []
-
-    async def advanced_search(self, query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-        """Perform advanced search based on type"""
-        results = {}
-
-        try:
-            if search_type == "web":
-                results["web"] = self.search(query, kwargs.get("max_results", 5))
-            elif search_type == "username":
-                results["osint"] = await self.osint_engine.search_username(query)
-            elif search_type == "image":
-                results["image"] = await self.osint_engine.search_image(query)
-            elif search_type == "social":
-                results["social"] = await self.osint_engine.search_social_media(
-                    query,
-                    kwargs.get("platform")
-                )
-            elif search_type == "personal":
-                results["personal"] = await self.osint_engine.gather_personal_info(kwargs)
-            elif search_type == "historical":
-                results["historical"] = await self.osint_engine.search_historical_data(query)
-
-        except Exception as e:
-            results["error"] = str(e)
-
-        return results
-
-# Main search function
-def search(query: str, max_results: int = 5) -> List[Dict[str, Any]]:
-    """Main search function"""
-    engine = WebSearchEngine()
-    return engine.search(query, max_results)
-
-# Main advanced search function
-async def advanced_search(query: str, search_type: str = "web", **kwargs) -> Dict[str, Any]:
-    """Main advanced search function"""
-    engine = WebSearchEngine()
-    return await engine.advanced_search(query, search_type, **kwargs)
space.yml DELETED
@@ -1,11 +0,0 @@
-title: Intelligent Search Engine
-emoji: 🔍
-colorFrom: blue
-colorTo: indigo
-sdk: gradio
-sdk_version: 4.14.0
-python_version: "3.10"
-app_file: app.py
-app_port: 7860
-pinned: false
-license: apache-2.0