# pocketflow/cookbook/pocketflow-tool-crawler/tools/parser.py
#
# 78 lines
# 2.0 KiB
# Python

from typing import Dict, List
from utils.call_llm import call_llm
def analyze_content(content: Dict) -> Dict:
    """Analyze webpage content using an LLM.

    Builds a prompt from the page's title, URL and (truncated) text, sends it
    to ``call_llm`` and parses the fenced YAML block found in the reply.

    The function is best-effort: any failure (a ``content`` dict that lacks a
    required key, an error raised by the LLM call, a reply without a YAML
    block, or YAML that does not have the expected fields) is printed and a
    placeholder analysis is returned instead of raising.

    Args:
        content (Dict): Webpage content with ``url``, ``title`` and ``text``.

    Returns:
        Dict: Analysis results with ``summary``, ``topics`` (a list) and
        ``content_type``. On failure the placeholder is
        ``{"summary": "Error analyzing content", "topics": [],
        "content_type": "unknown"}``.
    """
    # Only the first `max_chars` characters of the page text go into the
    # prompt, to keep the request size bounded.
    max_chars = 2000

    try:
        # The prompt is built inside the try so that malformed input (missing
        # key, non-sliceable text) produces the same fallback as every other
        # failure instead of an uncaught KeyError/TypeError.
        prompt = f"""
Analyze this webpage content:
Title: {content['title']}
URL: {content['url']}
Content: {content['text'][:max_chars]}
Please provide:
1. A brief summary (2-3 sentences)
2. Main topics/keywords (up to 5)
3. Content type (article, product page, etc)
Output in YAML format:
```yaml
summary: >
  brief summary here
topics:
  - topic 1
  - topic 2
content_type: type here
```
"""
        response = call_llm(prompt)

        # Extract the text between the first ```yaml fence and the next ```.
        _, fence, after_fence = response.partition("```yaml")
        if not fence:
            raise ValueError("LLM response contains no ```yaml block")
        yaml_str = after_fence.partition("```")[0].strip()

        # Imported lazily so a missing PyYAML install degrades to the
        # fallback result rather than breaking module import.
        import yaml

        analysis = yaml.safe_load(yaml_str)

        # Explicit checks instead of `assert`: assertions are stripped when
        # Python runs with -O, which would let malformed output through.
        if not isinstance(analysis, dict):
            raise ValueError(
                f"expected a YAML mapping, got {type(analysis).__name__}"
            )
        missing = [
            key
            for key in ("summary", "topics", "content_type")
            if key not in analysis
        ]
        if missing:
            raise ValueError(f"missing required field(s): {', '.join(missing)}")
        if not isinstance(analysis["topics"], list):
            raise ValueError("'topics' must be a list")

        return analysis
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary for one page,
        # and the exception types raised by call_llm are not known here.
        print(f"Error analyzing content: {e}")
        return {
            "summary": "Error analyzing content",
            "topics": [],
            "content_type": "unknown",
        }
def analyze_site(crawl_results: List[Dict]) -> List[Dict]:
    """Run content analysis over every crawled page that has text.

    Args:
        crawl_results (List[Dict]): List of crawled page contents. Entries
            that are falsy or whose ``"text"`` value is missing/empty are
            skipped.

    Returns:
        List[Dict]: The analyzed pages, in input order. Each is the same
        dict object that was passed in, with the result of
        ``analyze_content`` stored under the ``"analysis"`` key.
    """
    pages_with_analysis = []
    for page in crawl_results:
        # Guard clause: nothing to analyze without page text.
        if not page or not page.get("text"):
            continue
        page["analysis"] = analyze_content(page)
        pages_with_analysis.append(page)
    return pages_with_analysis