"""Batch OCR: send every image in a folder to a local Ollama vision model.

For each supported image file, the extracted text is written to a ``.txt``
file with the same stem inside the result directory.
"""

from pathlib import Path
import base64

import requests

OLLAMA_URL = "http://127.0.0.1:11434/api/chat"
MODEL = "llama3.2-vision"

# File suffixes (lower-case, with leading dot) that are treated as images.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".gif"}

# NOTE(review): ``requests`` timeouts are expressed in seconds, so this is
# roughly ten days. Kept as-is so slow local inference is never cut off;
# confirm whether milliseconds (i.e. 900 s) were intended.
REQUEST_TIMEOUT_S = 900000

PROMPT = (
    "Extract all readable text from this image. "
    "Return only the extracted text. "
    "Do not describe the image. "
    "If nothing is readable, return an empty string."
)


def img_to_b64(path: Path) -> str:
    """Return the contents of *path* as a base64-encoded ASCII string."""
    return base64.b64encode(path.read_bytes()).decode("utf-8")


def ocr_image(path: Path) -> str:
    """Ask the Ollama chat endpoint to extract the text visible in an image.

    Args:
        path: Image file to read and send to the model.

    Returns:
        The ``message.content`` field of the response with surrounding
        whitespace stripped, or an empty string when that field is missing
        or null.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    payload = {
        "model": MODEL,
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": PROMPT,
                "images": [img_to_b64(path)],
            },
        ],
    }

    response = requests.post(OLLAMA_URL, json=payload, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    data = response.json()

    # Tolerate both absent and explicitly-null fields in the response body.
    message = data.get("message") or {}
    content = message.get("content") or ""
    return content.strip()


def process_folder(folder: str, result: str) -> None:
    """Run OCR on every image in *folder* and save the text under *result*.

    Only regular files whose suffix (case-insensitive) is in ``IMAGE_EXTS``
    are processed. Each one produces ``<result>/<image stem>.txt`` encoded
    as UTF-8, overwriting any existing file of that name.

    Args:
        folder: Directory containing the input images.
        result: Directory for the output text files; created if missing.
    """
    folder_path = Path(folder)
    result_path = Path(result)

    # write_text() does not create parent directories, so make sure the
    # destination exists before the first image is processed.
    result_path.mkdir(parents=True, exist_ok=True)

    # iterdir() yields entries in arbitrary order; sort for reproducible runs.
    for img in sorted(folder_path.iterdir()):
        if not img.is_file() or img.suffix.lower() not in IMAGE_EXTS:
            continue

        print(f"Processing: {img.name}")
        text = ocr_image(img)
        out_file = result_path / f"{img.stem}.txt"
        out_file.write_text(text, encoding="utf-8")


if __name__ == "__main__":
    process_folder("images", "output")