"""Turn question/answer text blocks of a PDF into a multiple-choice quiz (JSON).

This is the Python code from the cells of ``Converter.ipynb``, laid out as a
plain module.  The pipeline is:

1. ``extract_tables_from_pdf`` reads every text block of the PDF and records,
   per text line, its text and whether it looks highlighted.
2. ``generate_quiz_from_tables`` treats the first line of each block as the
   question and the remaining lines as answer choices.
3. ``main`` wires both together and writes the result as JSON.
"""

import json
import re

import pandas as pd


def _rows_from_block(block):
    """Return one ``(text, is_highlighted)`` tuple per text line of *block*.

    Blocks without a ``'lines'`` entry yield an empty list.
    """
    rows = []
    for line in block.get("lines", ()):
        spans = line["spans"]
        text = " ".join(span["text"] for span in spans).strip()
        # NOTE(review): the span keys documented by PyMuPDF do not appear to
        # include 'bgcolor'; indexing it directly raised KeyError.  A missing
        # key is treated as "not highlighted" -- confirm how highlighting is
        # actually encoded in the source PDFs.
        highlighted = any(span.get("bgcolor", 0) != 0 for span in spans)
        rows.append((text, highlighted))
    return rows


def extract_tables_from_pdf(pdf_path):
    """Extract text blocks ("tables") and their highlighted rows from a PDF.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: One entry per text block that has at least one line, in page
        order.  Each entry is a list of ``(row_text, is_highlighted)`` tuples,
        one per text line of the block; ``row_text`` is the line's span texts
        joined by single spaces and stripped.
    """
    # Imported here so the quiz-building code stays usable (and importable)
    # on machines where PyMuPDF is not installed.
    import fitz  # PyMuPDF

    tables = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                table = _rows_from_block(block)
                if table:
                    tables.append(table)
    return tables


def generate_quiz_from_tables(tables):
    """Convert extracted tables into a multiple-choice quiz.

    Args:
        tables (list): List of tables, each a list of
            ``(row_text, is_highlighted)`` tuples as produced by
            ``extract_tables_from_pdf``.  Empty tables are ignored.

    Returns:
        dict: Maps each question (text of a table's first row) to a list of
        ``{"choice": str, "correct": bool}`` dictionaries.  Highlighted rows
        are marked correct and placed first (last highlighted row first),
        followed by the remaining rows in their original order.  If several
        tables share the same question text, the last one wins.
    """
    quiz = {}
    for table in tables:
        if not table:
            continue
        # The highlight flag of the question row itself is not used.
        (question, _), *answers = table
        correct = [
            {"choice": text, "correct": True}
            for text, highlighted in answers
            if highlighted
        ]
        incorrect = [
            {"choice": text, "correct": False}
            for text, highlighted in answers
            if not highlighted
        ]
        # Reversed to keep the ordering produced by inserting each correct
        # choice at the front of the list.
        quiz[question] = correct[::-1] + incorrect
    return quiz


def main(pdf_path="pelda_2.pdf", output_json="quiz.json"):
    """Extract the quiz from a PDF and save it as a JSON file.

    Args:
        pdf_path (str): Path of the PDF file to read.
        output_json (str): Path of the JSON file to write (UTF-8, indented,
            non-ASCII characters kept as-is).
    """
    print("Extracting tables from the PDF...")
    tables = extract_tables_from_pdf(pdf_path)

    print("Generating the quiz...")
    quiz = generate_quiz_from_tables(tables)

    print("Saving the quiz to JSON file...")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(quiz, f, ensure_ascii=False, indent=4)

    print(f"Quiz saved to {output_json}")


if __name__ == "__main__":
    main()