41 lines
1.2 KiB
Python
41 lines
1.2 KiB
Python
from typing import Optional

from PIL import Image

from tools.pdf import image_to_base64
from utils.call_llm import client
|
|
|
|
def extract_text_from_image(image: Image.Image, prompt: Optional[str] = None) -> str:
    """Extract text from an image using the OpenAI Vision API.

    Args:
        image (PIL.Image.Image): Image to process.
        prompt (str, optional): Custom prompt for extraction. When None, a
            general-purpose OCR prompt is used.

    Returns:
        str: Extracted text from the image, as returned by the model.
    """
    # Encode the image as base64 so it can be embedded in a data URL below.
    img_base64 = image_to_base64(image)

    # Fall back to a general OCR instruction when no prompt is supplied.
    if prompt is None:
        prompt = "Please extract all text from this image."

    # Single user message carrying both the text prompt and the inlined image.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}},
            ],
        }],
    )

    # NOTE(review): message.content may be None if the API returns no text —
    # callers currently assume a str; confirm whether a guard is needed.
    return response.choices[0].message.content
|
|
|
|
if __name__ == "__main__":
    # Smoke-test the vision extraction against a sample image on disk.
    sample_image = Image.open("example.png")
    extracted = extract_text_from_image(sample_image)
    print("Extracted text:", extracted)
|