# llama-vision-first-try/main.py

from pathlib import Path
import base64
import requests

# Chat endpoint of the Ollama server that ocr_image() posts to.
OLLAMA_URL = "http://127.0.0.1:11434/api/chat"

# Model name sent in the "model" field of every request.
MODEL = "llama3.2-vision"

# File suffixes treated as images; compared against the lower-cased suffix.
IMAGE_EXTS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".bmp",
    ".gif",
}

# Instruction sent alongside each image, one sentence per line.
PROMPT = "\n".join(
    (
        "Extract all readable text from this image.",
        "Return only the extracted text.",
        "Do not describe the image.",
        "If nothing is readable, return an empty string.",
    )
)

def img_to_b64(path: Path) -> str:
    """Return the contents of the file at *path* as a base64 text string.

    The whole file is read into memory, base64-encoded, and the resulting
    bytes are decoded to ``str``.
    """
    raw = path.read_bytes()
    encoded = base64.b64encode(raw)
    return str(encoded, "utf-8")

def ocr_image(path: Path, *, timeout: float = 900.0) -> str:
    """Ask the Ollama vision model to transcribe the text in one image.

    Posts a single non-streaming chat request to ``OLLAMA_URL`` carrying
    ``PROMPT`` and the base64-encoded contents of *path*.

    Args:
        path: Image file to send to the model.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.post``. ``requests`` measures timeouts in seconds,
            so the default is 15 minutes; pass a larger value (or ``None``
            to wait indefinitely) for very slow hardware.

    Returns:
        The model's reply with surrounding whitespace stripped, or an
        empty string when the response has no message content.

    Raises:
        OSError: If the image file cannot be read.
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection failures and timeouts.
    """
    payload = {
        "model": MODEL,
        "stream": False,
        "messages": [
            {
                "role": "user",
                "content": PROMPT,
                "images": [img_to_b64(path)],
            }
        ],
    }

    response = requests.post(OLLAMA_URL, json=payload, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # "message" / "content" may be absent or JSON null; `or` covers both,
    # whereas a .get() default only covers the absent case.
    message = data.get("message") or {}
    content = message.get("content") or ""
    return content.strip()

def process_folder(folder: str, result: str) -> None:
    """OCR every image directly inside *folder* and save the text to *result*.

    Each regular file whose lower-cased suffix is in ``IMAGE_EXTS`` is passed
    to ``ocr_image``; the returned text is written as UTF-8 to
    ``<result>/<image stem>.txt``. Subdirectories are not descended into.

    Images are processed in sorted path order so runs are reproducible.
    Images that share a stem (e.g. ``a.png`` and ``a.jpg``) map to the same
    output file, and the later one in sort order overwrites the earlier.

    Args:
        folder: Directory containing the input images.
        result: Directory for the ``.txt`` outputs; created if missing.

    Raises:
        OSError: If *folder* cannot be listed or an output cannot be written.
    """
    folder_path = Path(folder)
    result_path = Path(result)

    # List the inputs first so a missing input folder fails before any
    # output directory is created.
    images = sorted(
        entry
        for entry in folder_path.iterdir()
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTS
    )

    # write_text() does not create parent directories, so make sure the
    # destination exists before writing the first result.
    result_path.mkdir(parents=True, exist_ok=True)

    for img in images:
        print(f"Processing: {img.name}")
        text = ocr_image(img)
        out_file = result_path / f"{img.stem}.txt"
        out_file.write_text(text, encoding="utf-8")

# Script entry point: when run directly (not imported), OCR the images in the
# "images" directory and write the text files to the "output" directory.
# Both paths are relative, so they resolve against the current working directory.
if __name__ == "__main__":
    process_folder("images", "output")