128 lines
3.7 KiB
Python
128 lines
3.7 KiB
Python
from pocketflow import Node, BatchNode
|
|
from tools.pdf import pdf_to_images
|
|
from tools.vision import extract_text_from_image
|
|
from typing import List, Dict, Any
|
|
from pathlib import Path
|
|
import os
|
|
|
|
class ProcessPDFBatchNode(BatchNode):
    """Batch node that runs the single-PDF flow over every PDF in ``pdfs/``.

    ``prep`` lists the PDF files in the ``pdfs`` directory located next to
    this module, ``exec`` runs ``create_single_pdf_flow()`` on one of them,
    and ``post`` stores the collected per-file results in
    ``shared["results"]``.
    """

    def prep(self, shared: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Build one work item per PDF file found in the ``pdfs`` directory.

        Args:
            shared: Shared store. ``shared["extraction_prompt"]`` is used as
                the prompt for every file when present; otherwise a generic
                "extract all text" prompt is used.

        Returns:
            A list of ``{"pdf_path", "extraction_prompt"}`` dicts sorted by
            file name, or an empty list when there is nothing to process.
        """
        pdf_dir = Path(__file__).parent / "pdfs"

        # The prompt is identical for every file, so look it up only once.
        extraction_prompt = shared.get(
            "extraction_prompt",
            "Extract all text from this document, preserving formatting and layout.",
        )

        # A missing directory is treated like an empty one instead of letting
        # the directory listing raise FileNotFoundError.
        entries = pdf_dir.iterdir() if pdf_dir.is_dir() else ()

        # Directory listings come back in arbitrary order; sort by name so the
        # batch (and therefore shared["results"]) is deterministic. Entries
        # that merely look like PDFs but are not files are skipped.
        pdf_files = [
            {"pdf_path": str(entry), "extraction_prompt": extraction_prompt}
            for entry in sorted(entries, key=lambda path: path.name)
            if entry.name.lower().endswith(".pdf") and entry.is_file()
        ]

        if not pdf_files:
            print("No PDF files found in 'pdfs' directory!")
            return []

        print(f"Found {len(pdf_files)} PDF files")
        return pdf_files

    def exec(self, item: Dict[str, Any]) -> Dict[str, Any]:
        """Run the single-PDF flow for one work item produced by ``prep``.

        Args:
            item: Dict carrying at least ``"pdf_path"``; it is copied and the
                copy is handed to the flow as its shared store.

        Returns:
            ``{"filename": <base name of the PDF>, "text": <final text>}``.
            ``"text"`` falls back to ``"No text extracted"`` when the flow did
            not set ``"final_text"``.
        """
        flow = create_single_pdf_flow()

        filename = os.path.basename(item["pdf_path"])
        print(f"\nProcessing: {filename}")
        print("-" * 50)

        # Work on a shallow copy so keys written by the flow land in the copy
        # rather than in the batch item itself.
        flow_shared = item.copy()
        flow.run(flow_shared)

        return {
            "filename": filename,
            "text": flow_shared.get("final_text", "No text extracted"),
        }

    def post(
        self,
        shared: Dict[str, Any],
        prep_res: List[Dict[str, Any]],
        exec_res_list: List[Dict[str, Any]],
    ) -> str:
        """Store the per-file results in ``shared["results"]``.

        Returns:
            The action name ``"default"``.
        """
        shared["results"] = exec_res_list
        return "default"
|
|
|
|
class LoadPDFNode(Node):
    """Turn the PDF named in the shared store into page images."""

    def prep(self, shared):
        """Return ``shared["pdf_path"]``, or an empty string when it is unset."""
        pdf_path = shared.get("pdf_path", "")
        return pdf_path

    def exec(self, pdf_path):
        """Convert the PDF at ``pdf_path`` by calling ``pdf_to_images``."""
        page_images = pdf_to_images(pdf_path)
        return page_images

    def post(self, shared, prep_res, exec_res):
        """Store the conversion result under ``shared["page_images"]``.

        Returns the action name ``"default"``.
        """
        shared["page_images"] = exec_res
        return "default"
|
|
|
|
class ExtractTextNode(Node):
    """Run ``extract_text_from_image`` over every page image."""

    def prep(self, shared):
        """Return ``(page_images, extraction_prompt)`` read from ``shared``.

        Missing images default to an empty list, a missing prompt to ``None``.
        """
        page_images = shared.get("page_images", [])
        extraction_prompt = shared.get("extraction_prompt", None)
        return (page_images, extraction_prompt)

    def exec(self, inputs):
        """Extract text page by page.

        ``inputs`` is the ``(images, prompt)`` pair from ``prep``; each element
        of ``images`` is unpacked as ``(image, page_number)``. Returns a list
        of ``{"page": page_number, "text": extracted_text}`` dicts in input
        order.
        """
        page_images, extraction_prompt = inputs
        return [
            {
                "page": number,
                "text": extract_text_from_image(image, extraction_prompt),
            }
            for image, number in page_images
        ]

    def post(self, shared, prep_res, exec_res):
        """Store the per-page results under ``shared["extracted_text"]``.

        Returns the action name ``"default"``.
        """
        shared["extracted_text"] = exec_res
        return "default"
|
|
|
|
class CombineResultsNode(Node):
    """Merge per-page extraction results into one formatted string."""

    def prep(self, shared):
        """Return ``shared["extracted_text"]``, or an empty list when unset."""
        page_results = shared.get("extracted_text", [])
        return page_results

    def exec(self, results):
        """Join the page texts in ascending page order.

        Each entry of ``results`` must provide ``"page"`` and ``"text"``. Every
        page is rendered as a ``=== Page N ===`` header followed by its text,
        and the rendered pages are joined with newlines.
        """
        ordered_pages = sorted(results, key=lambda entry: entry["page"])
        return "\n".join(
            f"=== Page {entry['page']} ===\n{entry['text']}\n"
            for entry in ordered_pages
        )

    def post(self, shared, prep_res, exec_res):
        """Store the combined text under ``shared["final_text"]``.

        Returns the action name ``"default"``.
        """
        shared["final_text"] = exec_res
        return "default"
|
|
|
|
def create_single_pdf_flow():
    """Build the flow that processes a single PDF.

    The nodes are chained as ``LoadPDFNode >> ExtractTextNode >>
    CombineResultsNode`` and the returned ``Flow`` starts at the loader.
    """
    from pocketflow import Flow

    # One fresh node instance per step.
    loader = LoadPDFNode()
    extractor = ExtractTextNode()
    combiner = CombineResultsNode()

    # Wire the steps in processing order.
    loader >> extractor >> combiner

    return Flow(start=loader)
|