49 lines
1.4 KiB
Python
49 lines
1.4 KiB
Python
from pathlib import Path
|
|
import base64
|
|
import requests
|
|
|
|
# Chat endpoint of the Ollama server on the loopback interface.
OLLAMA_URL = "http://127.0.0.1:11434/api/chat"

# Model name placed in the "model" field of every request.
MODEL = "llama3.2-vision"

# File suffixes treated as images; compared against the lower-cased suffix.
IMAGE_EXTS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".bmp",
    ".gif",
}
|
|
|
|
# Instruction sent alongside every image: four sentences, one per line,
# with no leading or trailing whitespace.
PROMPT = "\n".join(
    (
        "Extract all readable text from this image.",
        "Return only the extracted text.",
        "Do not describe the image.",
        "If nothing is readable, return an empty string.",
    )
)
|
|
|
|
def img_to_b64(path: Path) -> str:
    """Return the raw bytes of the file at *path* encoded as a base64 string."""
    raw = path.read_bytes()
    encoded = base64.b64encode(raw)
    # b64encode yields bytes; decode them into a str.
    return encoded.decode("utf-8")
|
|
|
|
def ocr_image(path: Path, *, timeout: float = 900000) -> str:
    """Send one image to the Ollama chat endpoint and return the extracted text.

    The image is base64-encoded and attached to a single user message that
    carries ``PROMPT``. The request sets ``"stream": False``, and the reply
    is parsed as one JSON document.

    Args:
        path: Image file to transcribe.
        timeout: Timeout passed to ``requests.post``, in seconds. The default
            keeps the value this function has always used.

    Returns:
        The ``message.content`` field of the response with surrounding
        whitespace removed, or ``""`` when that field is missing or null.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures and timeouts.
        OSError: If the image file cannot be read.
    """
    payload = {
        "model": MODEL,
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": PROMPT,
                "images": [img_to_b64(path)],
            },
        ],
    }

    # NOTE(review): requests timeouts are in seconds, so the default of
    # 900000 is roughly ten days. It looks like milliseconds were intended;
    # confirm and lower the default if so.
    response = requests.post(OLLAMA_URL, json=payload, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # The `or` fallbacks also cover keys that are present but null, which
    # dict.get's default argument alone does not.
    message = data.get("message") or {}
    content = message.get("content") or ""
    return content.strip()
|
|
|
|
def process_folder(folder: str, result: str) -> None:
    """OCR every image directly inside *folder* and save the text under *result*.

    Only regular files whose lower-cased suffix is in ``IMAGE_EXTS`` are
    processed; subdirectories are not descended into. Each image produces
    ``<stem>.txt`` (UTF-8) in the result directory.

    NOTE: images that share a stem (e.g. ``a.png`` and ``a.jpg``) map to the
    same output file, so the later one overwrites the earlier one.

    Args:
        folder: Directory containing the input images.
        result: Directory that receives the ``.txt`` files; created, including
            missing parents, if it does not exist yet.

    Raises:
        OSError: If *folder* cannot be listed or an output file cannot be
            written. Errors raised by ``ocr_image`` propagate unchanged and
            stop the run.
    """
    folder_path = Path(folder)
    result_path = Path(result)

    # List the inputs first so a missing input folder fails before anything
    # is created. Sorting makes the processing order reproducible, because
    # iterdir() yields entries in arbitrary order.
    entries = sorted(folder_path.iterdir())

    # write_text() does not create missing directories, so make sure the
    # destination exists before the first result is written.
    result_path.mkdir(parents=True, exist_ok=True)

    for img in entries:
        if not img.is_file() or img.suffix.lower() not in IMAGE_EXTS:
            continue

        print(f"Processing: {img.name}")
        text = ocr_image(img)
        out_file = result_path / f"{img.stem}.txt"
        out_file.write_text(text, encoding="utf-8")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: both paths are relative, so they resolve against
    # the current working directory.
    process_folder(folder="images", result="output")