pocketflow/cookbook/pocketflow-tool-crawler/tools/crawler.py

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import Dict, List, Optional, Set


class WebCrawler:
    """Simple web crawler that extracts content and follows links"""

    def __init__(self, base_url: str, max_pages: int = 10):
        self.base_url = base_url
        self.max_pages = max_pages
        self.visited: Set[str] = set()

    def is_valid_url(self, url: str) -> bool:
        """Check if a URL belongs to the same domain as base_url"""
        base_domain = urlparse(self.base_url).netloc
        url_domain = urlparse(url).netloc
        return base_domain == url_domain

    def extract_page_content(self, url: str) -> Optional[Dict]:
        """Extract title, text, and same-domain links from a single page.

        Returns None if the page cannot be fetched or parsed.
        """
        try:
            # Timeout added so one hanging server cannot stall the whole
            # crawl; 10 seconds is an assumed default, not tuned.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract main content; guard against a <title> tag whose
            # .string is None (e.g. when it contains nested markup)
            content = {
                "url": url,
                "title": soup.title.string if soup.title and soup.title.string else "",
                "text": soup.get_text(separator="\n", strip=True),
                "links": []
            }

            # Extract links, resolving relative hrefs against the page URL
            # and keeping only those on the same domain
            for link in soup.find_all("a"):
                href = link.get("href")
                if href:
                    absolute_url = urljoin(url, href)
                    if self.is_valid_url(absolute_url):
                        content["links"].append(absolute_url)

            return content

        except Exception as e:
            print(f"Error crawling {url}: {e}")
            return None

    def crawl(self) -> List[Dict]:
        """Crawl the site breadth-first starting from base_url"""
        to_visit = [self.base_url]
        results = []

        while to_visit and len(self.visited) < self.max_pages:
            url = to_visit.pop(0)
            if url in self.visited:
                continue

            print(f"Crawling: {url}")
            # Mark the URL as visited even if extraction fails, so a
            # broken page linked from many others is not fetched repeatedly
            self.visited.add(url)
            content = self.extract_page_content(url)

            if content:
                results.append(content)
                # Queue newly discovered URLs not yet visited or queued
                new_urls = [link for link in content["links"]
                            if link not in self.visited
                            and link not in to_visit]
                to_visit.extend(new_urls)

        return results
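

# A minimal usage sketch, not part of the original module: the target URL
# and max_pages value below are placeholder assumptions for illustration.
if __name__ == "__main__":
    crawler = WebCrawler("https://example.com", max_pages=5)
    pages = crawler.crawl()
    for page in pages:
        print(f"{page['url']}: {page['title']} ({len(page['links'])} links)")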