78 lines
2.0 KiB
Python
78 lines
2.0 KiB
Python
from typing import Dict, List
|
|
from utils.call_llm import call_llm
|
|
|
|
def analyze_content(content: Dict) -> Dict:
|
|
"""Analyze webpage content using LLM
|
|
|
|
Args:
|
|
content (Dict): Webpage content with url, title and text
|
|
|
|
Returns:
|
|
Dict: Analysis results including summary and topics
|
|
"""
|
|
prompt = f"""
|
|
Analyze this webpage content:
|
|
|
|
Title: {content['title']}
|
|
URL: {content['url']}
|
|
Content: {content['text'][:2000]} # Limit content length
|
|
|
|
Please provide:
|
|
1. A brief summary (2-3 sentences)
|
|
2. Main topics/keywords (up to 5)
|
|
3. Content type (article, product page, etc)
|
|
|
|
Output in YAML format:
|
|
```yaml
|
|
summary: >
|
|
brief summary here
|
|
topics:
|
|
- topic 1
|
|
- topic 2
|
|
content_type: type here
|
|
```
|
|
"""
|
|
|
|
try:
|
|
response = call_llm(prompt)
|
|
# Extract YAML between code fences
|
|
yaml_str = response.split("```yaml")[1].split("```")[0].strip()
|
|
|
|
import yaml
|
|
analysis = yaml.safe_load(yaml_str)
|
|
|
|
# Validate required fields
|
|
assert "summary" in analysis
|
|
assert "topics" in analysis
|
|
assert "content_type" in analysis
|
|
assert isinstance(analysis["topics"], list)
|
|
|
|
return analysis
|
|
|
|
except Exception as e:
|
|
print(f"Error analyzing content: {str(e)}")
|
|
return {
|
|
"summary": "Error analyzing content",
|
|
"topics": [],
|
|
"content_type": "unknown"
|
|
}
|
|
|
|
def analyze_site(crawl_results: List[Dict]) -> List[Dict]:
|
|
"""Analyze all crawled pages
|
|
|
|
Args:
|
|
crawl_results (List[Dict]): List of crawled page contents
|
|
|
|
Returns:
|
|
List[Dict]: Original content with added analysis
|
|
"""
|
|
analyzed_results = []
|
|
|
|
for content in crawl_results:
|
|
if content and content.get("text"):
|
|
analysis = analyze_content(content)
|
|
content["analysis"] = analysis
|
|
analyzed_results.append(content)
|
|
|
|
return analyzed_results
|