# %% [markdown]
# # PDF quiz builder
#
# Reads the text blocks of a PDF with PyMuPDF, flags rows that look highlighted,
# and writes the result as a multiple-choice quiz in JSON.

# %%
import json
import re
import warnings

import pandas as pd

try:
    import fitz  # PyMuPDF
except ImportError:
    # Deferred on purpose: the quiz-building helpers stay importable without
    # PyMuPDF, and extract_tables_from_pdf raises an actionable error instead.
    fitz = None

# %%
# Configuration: default input/output locations used by main().
DEFAULT_PDF_PATH = "pelda_2.pdf"
DEFAULT_OUTPUT_JSON = "quiz.json"


# %%
def _row_from_spans(spans):
    """Collapse the spans of one text line into a ``(text, highlighted)`` pair.

    The text is the span texts joined by single spaces with surrounding
    whitespace stripped. The row counts as highlighted when any span carries a
    non-zero ``bgcolor`` value; a span without that key counts as not
    highlighted.
    """
    text = " ".join(span["text"] for span in spans).strip()
    highlighted = any(span.get("bgcolor", 0) != 0 for span in spans)
    return text, highlighted


def extract_tables_from_pdf(pdf_path):
    """
    Extracts text blocks ("tables") and their highlighted rows from a PDF file.

    Every block that has at least one text line becomes one table; every line
    of the block becomes one row.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: One entry per non-empty text block, each a list of
        ``(row_text, is_highlighted)`` tuples in reading order.

    Raises:
        ImportError: If PyMuPDF (``fitz``) is not installed.

    Warns:
        RuntimeWarning: If spans were read but none of them had a ``bgcolor``
        key, in which case every row is reported as not highlighted.
    """
    if fitz is None:
        raise ImportError(
            "PyMuPDF (fitz) is required to read PDFs; install it with "
            "`pip install pymupdf`."
        )

    tables = []
    saw_span = False
    saw_bgcolor = False

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                lines = block.get("lines")
                if not lines:
                    # Blocks without text lines contribute no table.
                    continue

                rows = []
                for line in lines:
                    spans = line["spans"]
                    saw_span = saw_span or bool(spans)
                    saw_bgcolor = saw_bgcolor or any("bgcolor" in span for span in spans)
                    rows.append(_row_from_spans(spans))
                tables.append(rows)

    # NOTE(review): PyMuPDF's documented span dictionary does not appear to list
    # a "bgcolor" key -- confirm against the installed version. If it is never
    # present, highlight detection probably has to be based on page drawings or
    # annotations instead.
    if saw_span and not saw_bgcolor:
        warnings.warn(
            "No text span exposed a 'bgcolor' key; every row was reported as "
            "not highlighted.",
            RuntimeWarning,
            stacklevel=2,
        )

    return tables


# %%
def generate_quiz_from_tables(tables):
    """
    Converts extracted tables into a multiple-choice quiz.

    The first row of each table is the question; each remaining row is a
    choice. Highlighted choices are marked correct and inserted at the front of
    the choice list, so several highlighted rows end up in reverse order ahead
    of the non-highlighted ones. A later table whose first row repeats an
    earlier question replaces that question's choices.

    Args:
        tables (list): List of tables, each a list of
            ``(row_text, is_highlighted)`` tuples.

    Returns:
        dict: Maps each question text to a list of
        ``{"choice": str, "correct": bool}`` dictionaries.
    """
    quiz = {}

    for table in tables:
        if not table:
            continue

        (question, _), *choice_rows = table
        choices = []
        for choice, is_highlighted in choice_rows:
            if is_highlighted:
                choices.insert(0, {"choice": choice, "correct": True})
            else:
                choices.append({"choice": choice, "correct": False})
        quiz[question] = choices

    return quiz


# %%
def main(pdf_path=DEFAULT_PDF_PATH, output_json=DEFAULT_OUTPUT_JSON):
    """
    Runs the full pipeline: extract tables, build the quiz, save it as JSON.

    Args:
        pdf_path (str): PDF file to read.
        output_json (str): Destination file; written as UTF-8 JSON with
            non-ASCII characters kept as-is and a 4-space indent.
    """
    print("Extracting tables from the PDF...")
    tables = extract_tables_from_pdf(pdf_path)

    print("Generating the quiz...")
    quiz = generate_quiz_from_tables(tables)

    print("Saving the quiz to JSON file...")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(quiz, f, ensure_ascii=False, indent=4)

    print(f"Quiz saved to {output_json}")


if __name__ == "__main__":
    main()