import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import Dict, List, Optional, Set


class WebCrawler:
    """Simple web crawler that extracts page content and follows same-domain links."""

    def __init__(self, base_url: str, max_pages: int = 10):
        self.base_url = base_url
        self.max_pages = max_pages
        self.visited: Set[str] = set()

    def is_valid_url(self, url: str) -> bool:
        """Check if URL belongs to the same domain as base_url."""
        base_domain = urlparse(self.base_url).netloc
        url_domain = urlparse(url).netloc
        return base_domain == url_domain

    def extract_page_content(self, url: str) -> Optional[Dict]:
        """Extract title, text, and same-domain links from a single page."""
        try:
            # A timeout keeps one hung request from stalling the whole crawl
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            # Extract main content
            content = {
                "url": url,
                "title": soup.title.string if soup.title and soup.title.string else "",
                "text": soup.get_text(separator="\n", strip=True),
                "links": [],
            }

            # Resolve relative hrefs and keep only same-domain links
            for link in soup.find_all("a"):
                href = link.get("href")
                if href:
                    absolute_url = urljoin(url, href)
                    if self.is_valid_url(absolute_url):
                        content["links"].append(absolute_url)

            return content

        except Exception as e:
            print(f"Error crawling {url}: {e}")
            return None

    def crawl(self) -> List[Dict]:
        """Crawl the site breadth-first starting from base_url."""
        to_visit = [self.base_url]
        results = []

        while to_visit and len(self.visited) < self.max_pages:
            url = to_visit.pop(0)

            if url in self.visited:
                continue

            print(f"Crawling: {url}")
            content = self.extract_page_content(url)
            # Mark the URL as visited even on failure so it is not re-queued
            # when other pages link back to it.
            self.visited.add(url)

            if content:
                results.append(content)

                # Add new URLs to visit that are not yet seen or already queued
                new_urls = [link for link in content["links"]
                            if link not in self.visited
                            and link not in to_visit]
                to_visit.extend(new_urls)

        return results
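

# Minimal usage sketch: "https://example.com" is a placeholder URL and
# max_pages=5 an arbitrary limit; swap in a real site before running.
if __name__ == "__main__":
    crawler = WebCrawler("https://example.com", max_pages=5)
    pages = crawler.crawl()
    for page in pages:
        # Print a short summary: page title plus number of same-domain links found
        print(f"{page['title']!r}: {len(page['links'])} links")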