# Source: pocketflow/cookbook/pocketflow-tool-pdf-vision/tools/vision.py
# (file-viewer metadata: 41 lines, 1.2 KiB, Python)

from PIL import Image
from utils.call_llm import client
from tools.pdf import image_to_base64
def extract_text_from_image(image: Image.Image, prompt: str = None) -> str:
    """Ask the vision-capable chat model to read the text in an image.

    The image is base64-encoded with ``image_to_base64`` and sent, together
    with a text instruction, as a single user message through the shared
    ``client`` imported from ``utils.call_llm``.

    Args:
        image (PIL.Image): Image to read text from.
        prompt (str, optional): Instruction sent alongside the image. When
            ``None``, a generic "extract all text" instruction is used.

    Returns:
        str: The message content of the first choice in the model response.
    """
    # Encode first so an unreadable image fails before any request is built.
    encoded = image_to_base64(image)

    # Only ``None`` triggers the fallback; any other value is sent as given.
    if prompt is None:
        prompt = "Please extract all text from this image."

    # The payload is labelled as PNG; presumably image_to_base64 emits PNG
    # bytes -- confirm against tools.pdf.
    data_url = f"data:image/png;base64,{encoded}"
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
if __name__ == "__main__":
    # Manual smoke test: run OCR on a local sample image and print the result.
    # Expects "example.png" in the current working directory.
    # The context manager closes the underlying file once extraction is done,
    # instead of leaving the handle open until interpreter exit.
    with Image.open("example.png") as test_image:
        result = extract_text_from_image(test_image)
    print("Extracted text:", result)