{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "\n",
    "import fitz  # PyMuPDF\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "def extract_tables_from_pdf(pdf_path):\n",
    "    \"\"\"\n",
    "    Collect the text lines of every text block in a PDF and flag highlighted ones.\n",
    "\n",
    "    Each text block reported by PyMuPDF is treated as one table. Every line of\n",
    "    the block becomes a row: the texts of its spans joined with single spaces,\n",
    "    paired with a flag that is True when any span of the line has a non-zero\n",
    "    'bgcolor' value.\n",
    "\n",
    "    Args:\n",
    "        pdf_path (str): Path to the PDF file.\n",
    "\n",
    "    Returns:\n",
    "        list: One entry per non-empty text block; each entry is a list of\n",
    "            (row_text, is_highlighted) tuples, one per line of the block.\n",
    "    \"\"\"\n",
    "    tables = []\n",
    "\n",
    "    # The context manager closes the document even if extraction raises.\n",
    "    with fitz.open(pdf_path) as doc:\n",
    "        for page in doc:\n",
    "            for block in page.get_text(\"dict\")['blocks']:\n",
    "                # Only blocks that have a 'lines' entry are processed.\n",
    "                if 'lines' not in block:\n",
    "                    continue\n",
    "\n",
    "                table_data = []\n",
    "                for line in block['lines']:\n",
    "                    spans = line['spans']\n",
    "                    row_text = ' '.join(span['text'] for span in spans).strip()\n",
    "                    # NOTE(review): assumes every span dict exposes a 'bgcolor' key;\n",
    "                    # confirm against the installed PyMuPDF version.\n",
    "                    highlight_detected = any(span['bgcolor'] != 0 for span in spans)\n",
    "                    table_data.append((row_text, highlight_detected))\n",
    "\n",
    "                if table_data:\n",
    "                    tables.append(table_data)\n",
    "\n",
    "    return tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "def generate_quiz_from_tables(tables):\n",
    "    \"\"\"\n",
    "    Build a multiple-choice quiz from extracted table rows.\n",
    "\n",
    "    The first row of each table supplies the question text; every following\n",
    "    row is an answer choice. Highlighted choices are marked correct and placed\n",
    "    at the front of the question's list, all others are appended in order.\n",
    "    A table whose question text was already seen replaces the earlier entry.\n",
    "\n",
    "    Args:\n",
    "        tables (list): Tables given as lists of (row_text, is_highlighted) pairs.\n",
    "\n",
    "    Returns:\n",
    "        dict: Maps each question text to a list of\n",
    "            {\"choice\": ..., \"correct\": ...} dictionaries.\n",
    "    \"\"\"\n",
    "    quiz = {}\n",
    "\n",
    "    for table in tables:\n",
    "        rows = iter(table)\n",
    "        try:\n",
    "            question, _ = next(rows)\n",
    "        except StopIteration:\n",
    "            # An empty table contributes nothing.\n",
    "            continue\n",
    "\n",
    "        answers = quiz[question] = []\n",
    "        for text, highlighted in rows:\n",
    "            if highlighted:\n",
    "                answers.insert(0, {\"choice\": text, \"correct\": True})\n",
    "            else:\n",
    "                answers.append({\"choice\": text, \"correct\": False})\n",
    "\n",
    "    return quiz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "def main(pdf_path=\"pelda_2.pdf\", output_json=\"quiz.json\"):\n",
    "    \"\"\"\n",
    "    Extract the quiz from a PDF and save it as a JSON file.\n",
    "\n",
    "    Args:\n",
    "        pdf_path (str): Path of the PDF to read. Defaults to \"pelda_2.pdf\".\n",
    "        output_json (str): Path of the JSON file to write. Defaults to \"quiz.json\".\n",
    "    \"\"\"\n",
    "    print(\"Extracting tables from the PDF...\")\n",
    "    tables = extract_tables_from_pdf(pdf_path)\n",
    "\n",
    "    print(\"Generating the quiz...\")\n",
    "    quiz = generate_quiz_from_tables(tables)\n",
    "\n",
    "    print(\"Saving the quiz to JSON file...\")\n",
    "    with open(output_json, \"w\", encoding=\"utf-8\") as f:\n",
    "        # ensure_ascii=False writes non-ASCII characters as-is instead of \\u escapes.\n",
    "        json.dump(quiz, f, ensure_ascii=False, indent=4)\n",
    "\n",
    "    print(f\"Quiz saved to {output_json}\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}