FirstTry

2025-01-06 13:33:22 +01:00
commit f2e44e0419
2 changed files with 141 additions and 0 deletions
--- a/Converter.ipynb
+++ b/Converter.ipynb
@ -0,0 +1,141 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import fitz  # PyMuPDF\n",
+    "import pandas as pd\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def extract_tables_from_pdf(pdf_path):\n",
+    "    \"\"\"\n",
+    "    Extract the text rows of every text block in a PDF, flagging highlighted rows.\n",
+    "\n",
+    "    Args:\n",
+    "        pdf_path (str): Path to the PDF file.\n",
+    "\n",
+    "    Returns:\n",
+    "        list: One entry per non-empty text block; each entry is a list of\n",
+    "            (row_text, is_highlighted) tuples, one per text line.\n",
+    "    \"\"\"\n",
+    "    tables = []\n",
+    "\n",
+    "    # The context manager closes the document even if extraction fails.\n",
+    "    with fitz.open(pdf_path) as doc:\n",
+    "        for page in doc:\n",
+    "            for block in page.get_text(\"dict\")['blocks']:\n",
+    "                # Blocks without a 'lines' entry hold no text rows.\n",
+    "                if 'lines' not in block:\n",
+    "                    continue\n",
+    "\n",
+    "                table_data = []\n",
+    "                for line in block['lines']:\n",
+    "                    spans = line['spans']\n",
+    "                    row_text = ' '.join(span['text'] for span in spans).strip()\n",
+    "                    # A span dict may lack 'bgcolor'; .get() treats that as\n",
+    "                    # no highlighting instead of raising KeyError. Any\n",
+    "                    # non-zero value marks the whole row as highlighted.\n",
+    "                    highlighted = any(span.get('bgcolor', 0) != 0 for span in spans)\n",
+    "                    table_data.append((row_text, highlighted))\n",
+    "\n",
+    "                if table_data:\n",
+    "                    tables.append(table_data)\n",
+    "\n",
+    "    return tables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def generate_quiz_from_tables(tables):\n",
+    "    \"\"\"\n",
+    "    Build a multiple-choice quiz from the extracted tables.\n",
+    "\n",
+    "    Args:\n",
+    "        tables (list): Tables, each a sequence of (row_text, is_highlighted) pairs.\n",
+    "\n",
+    "    Returns:\n",
+    "        dict: Maps each question (a table's first row) to its list of choices.\n",
+    "    \"\"\"\n",
+    "    quiz = {}\n",
+    "\n",
+    "    for table in tables:\n",
+    "        for position, (text, highlighted) in enumerate(table):\n",
+    "            if position == 0:\n",
+    "                # Leading row: its text is the question; choices start empty.\n",
+    "                question = text\n",
+    "                quiz[question] = []\n",
+    "                continue\n",
+    "            entry = {\"choice\": text, \"correct\": bool(highlighted)}\n",
+    "            # Highlighted choices are placed first, all others go last.\n",
+    "            slot = 0 if highlighted else len(quiz[question])\n",
+    "            quiz[question].insert(slot, entry)\n",
+    "\n",
+    "    return quiz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def main():\n",
+    "    \"\"\"Extract the tables from the PDF, build the quiz and save it as JSON.\"\"\"\n",
+    "    import json  # only this function needs it\n",
+    "\n",
+    "    pdf_path = \"pelda_2.pdf\"    # input PDF; replace with your own path\n",
+    "    output_json = \"quiz.json\"   # name of the JSON file to write\n",
+    "\n",
+    "    print(\"Extracting tables from the PDF...\")\n",
+    "    tables = extract_tables_from_pdf(pdf_path)\n",
+    "    print(\"Generating the quiz...\")\n",
+    "    quiz = generate_quiz_from_tables(tables)\n",
+    "\n",
+    "    print(\"Saving the quiz to JSON file...\")\n",
+    "    with open(output_json, \"w\", encoding=\"utf-8\") as out_file:\n",
+    "        json.dump(quiz, out_file, ensure_ascii=False, indent=4)\n",
+    "    print(f\"Quiz saved to {output_json}\")\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    main()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/pelda_2.pdf
+++ b/pelda_2.pdf