import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import Dict, List, Optional, Set


class WebCrawler:
    """Simple web crawler that extracts content and follows links."""

    def __init__(self, base_url: str, max_pages: int = 10):
        self.base_url = base_url
        self.max_pages = max_pages
        self.visited: Set[str] = set()

    def is_valid_url(self, url: str) -> bool:
        """Check if URL belongs to the same domain as the base URL."""
        base_domain = urlparse(self.base_url).netloc
        url_domain = urlparse(url).netloc
        return base_domain == url_domain

    def extract_page_content(self, url: str) -> Optional[Dict]:
        """Extract title, text, and same-domain links from a single page."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract main content
            content = {
                "url": url,
                "title": soup.title.string if soup.title else "",
                "text": soup.get_text(separator="\n", strip=True),
                "links": [],
            }

            # Extract links, resolving relative hrefs to absolute URLs
            for link in soup.find_all("a"):
                href = link.get("href")
                if href:
                    absolute_url = urljoin(url, href)
                    if self.is_valid_url(absolute_url):
                        content["links"].append(absolute_url)

            return content
        except Exception as e:
            print(f"Error crawling {url}: {e}")
            return None

    def crawl(self) -> List[Dict]:
        """Crawl the website breadth-first starting from base_url."""
        to_visit = [self.base_url]
        results = []

        while to_visit and len(self.visited) < self.max_pages:
            url = to_visit.pop(0)
            if url in self.visited:
                continue

            print(f"Crawling: {url}")
            content = self.extract_page_content(url)

            if content:
                self.visited.add(url)
                results.append(content)

                # Queue newly discovered URLs that are not already visited or queued
                new_urls = [
                    link for link in content["links"]
                    if link not in self.visited and link not in to_visit
                ]
                to_visit.extend(new_urls)

        return results
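

# A minimal usage sketch, assuming the class above is importable and that
# "https://example.com" stands in for whatever site you actually want to crawl.
if __name__ == "__main__":
    crawler = WebCrawler("https://example.com", max_pages=5)
    pages = crawler.crawl()
    for page in pages:
        print(f"{page['title']} ({page['url']}): {len(page['links'])} same-domain links")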