This commit is contained in:
2025-01-06 13:33:22 +01:00
commit f2e44e0419
2 changed files with 141 additions and 0 deletions

141
Converter.ipynb Normal file
View File

@ -0,0 +1,141 @@
{
"cells": [
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "vscode": {
   "languageId": "plaintext"
  }
 },
 "outputs": [],
 "source": [
  "import json\n",
  "import re\n",
  "\n",
  "import fitz  # PyMuPDF\n",
  "import pandas as pd"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "vscode": {
   "languageId": "plaintext"
  }
 },
 "outputs": [],
 "source": [
  "def _highlight_rects(page):\n",
  "    \"\"\"\n",
  "    Collects the highlighted areas of a page as (x0, y0, x1, y1) tuples.\n",
  "\n",
  "    Two sources are inspected because PDF producers mark text differently:\n",
  "    highlight annotations, and non-white filled vector shapes.\n",
  "    \"\"\"\n",
  "    rects = []\n",
  "    for annot in page.annots():\n",
  "        # annot.type is an (int, str) pair; the int identifies the annotation kind.\n",
  "        if annot.type[0] == fitz.PDF_ANNOT_HIGHLIGHT:\n",
  "            rects.append(tuple(annot.rect))\n",
  "    for drawing in page.get_drawings():\n",
  "        fill = drawing.get('fill')\n",
  "        # A white fill is ordinary page/cell background, not a highlight.\n",
  "        if fill is not None and any(channel < 1 for channel in fill):\n",
  "            rects.append(tuple(drawing['rect']))\n",
  "    return rects\n",
  "\n",
  "\n",
  "def _span_is_highlighted(span, highlight_rects):\n",
  "    \"\"\"Tells whether the centre of a text span lies on a highlighted area.\"\"\"\n",
  "    # Honour an explicit 'bgcolor' entry when present, as the original code did;\n",
  "    # .get() avoids a KeyError for span dictionaries that do not carry the key.\n",
  "    if span.get('bgcolor', 0) != 0:\n",
  "        return True\n",
  "    x0, y0, x1, y1 = span['bbox']\n",
  "    cx, cy = (x0 + x1) / 2, (y0 + y1) / 2\n",
  "    # Testing the centre (not mere overlap) keeps thin border lines that only\n",
  "    # touch the edge of the text from counting as a highlight.\n",
  "    return any(\n",
  "        rx0 <= cx <= rx1 and ry0 <= cy <= ry1\n",
  "        for rx0, ry0, rx1, ry1 in highlight_rects\n",
  "    )\n",
  "\n",
  "\n",
  "def extract_tables_from_pdf(pdf_path):\n",
  "    \"\"\"\n",
  "    Extracts text blocks (\"tables\") and their highlighted rows from a PDF file.\n",
  "\n",
  "    Args:\n",
  "        pdf_path (str): Path to the PDF file.\n",
  "\n",
  "    Returns:\n",
  "        list: One entry per non-empty text block, in page order. Each entry is\n",
  "            a list of (row_text, is_highlighted) tuples, one per text line.\n",
  "    \"\"\"\n",
  "    tables = []\n",
  "\n",
  "    # The context manager closes the document even if parsing fails.\n",
  "    with fitz.open(pdf_path) as doc:\n",
  "        for page in doc:\n",
  "            highlight_rects = _highlight_rects(page)\n",
  "\n",
  "            for block in page.get_text(\"dict\")['blocks']:\n",
  "                table_data = []\n",
  "                # Blocks without a 'lines' key contribute no rows.\n",
  "                for line in block.get('lines', []):\n",
  "                    spans = line['spans']\n",
  "                    row_text = ' '.join(span['text'] for span in spans).strip()\n",
  "                    is_highlighted = any(\n",
  "                        _span_is_highlighted(span, highlight_rects) for span in spans\n",
  "                    )\n",
  "                    table_data.append((row_text, is_highlighted))\n",
  "\n",
  "                if table_data:\n",
  "                    tables.append(table_data)\n",
  "\n",
  "    return tables"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "vscode": {
   "languageId": "plaintext"
  }
 },
 "outputs": [],
 "source": [
  "def generate_quiz_from_tables(tables):\n",
  "    \"\"\"\n",
  "    Converts extracted tables into a multiple-choice quiz.\n",
  "\n",
  "    Args:\n",
  "        tables (list): One entry per table. Each table is a sequence of\n",
  "            (row_text, is_highlighted) pairs; the first pair is the question\n",
  "            and the remaining pairs are the answer choices.\n",
  "\n",
  "    Returns:\n",
  "        dict: Maps each question text to a list of\n",
  "            {\"choice\": str, \"correct\": bool} entries. Highlighted rows are\n",
  "            marked correct and inserted ahead of the other choices.\n",
  "    \"\"\"\n",
  "    quiz = {}\n",
  "\n",
  "    for table in tables:\n",
  "        if not table:\n",
  "            continue  # nothing to read a question from\n",
  "\n",
  "        (question, _), *choice_rows = table\n",
  "        # setdefault keeps the choices already gathered when the same question\n",
  "        # text shows up in more than one table, instead of discarding them.\n",
  "        choices = quiz.setdefault(question, [])\n",
  "\n",
  "        for choice, is_highlighted in choice_rows:\n",
  "            if is_highlighted:\n",
  "                choices.insert(0, {\"choice\": choice, \"correct\": True})\n",
  "            else:\n",
  "                choices.append({\"choice\": choice, \"correct\": False})\n",
  "\n",
  "    return quiz"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "vscode": {
   "languageId": "plaintext"
  }
 },
 "outputs": [],
 "source": [
  "def main(pdf_path=\"pelda_2.pdf\", output_json=\"quiz.json\"):\n",
  "    \"\"\"\n",
  "    Runs the full conversion: PDF -> extracted tables -> quiz -> JSON file.\n",
  "\n",
  "    Args:\n",
  "        pdf_path (str): Path of the PDF file to read.\n",
  "        output_json (str): Path of the JSON file to write; it is overwritten.\n",
  "    \"\"\"\n",
  "    print(\"Extracting tables from the PDF...\")\n",
  "    tables = extract_tables_from_pdf(pdf_path)\n",
  "\n",
  "    print(\"Generating the quiz...\")\n",
  "    quiz = generate_quiz_from_tables(tables)\n",
  "\n",
  "    print(\"Saving the quiz to JSON file...\")\n",
  "    with open(output_json, \"w\", encoding=\"utf-8\") as f:\n",
  "        # ensure_ascii=False writes non-ASCII characters as-is rather than as \\u escapes.\n",
  "        json.dump(quiz, f, ensure_ascii=False, indent=4)\n",
  "\n",
  "    print(f\"Quiz saved to {output_json}\")\n",
  "\n",
  "if __name__ == \"__main__\":\n",
  "    main()\n"
 ]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

BIN
pelda_2.pdf Normal file

Binary file not shown.