FirstTry
This commit is contained in:
141
Converter.ipynb
Normal file
141
Converter.ipynb
Normal file
@ -0,0 +1,141 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"vscode": {
|
||||
"languageId": "plaintext"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import fitz # PyMuPDF\n",
|
||||
"import pandas as pd\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"def _span_is_highlighted(span, highlight_rects):\n",
"    \"\"\"\n",
"    Tell whether a text span is highlighted.\n",
"\n",
"    A span counts as highlighted when it reports a non-zero 'bgcolor' or\n",
"    when it lies under one of the given highlight-annotation rectangles.\n",
"\n",
"    Args:\n",
"        span (dict): A span dictionary taken from page.get_text('dict').\n",
"        highlight_rects (list): Rectangles of the page's highlight annotations.\n",
"\n",
"    Returns:\n",
"        bool: True if the span is considered highlighted.\n",
"    \"\"\"\n",
"    # PyMuPDF's documented span dictionary does not list a 'bgcolor' key, so\n",
"    # read it with .get(); plain indexing raises KeyError when it is absent.\n",
"    if span.get('bgcolor', 0) != 0:\n",
"        return True\n",
"\n",
"    x0, y0, x1, y1 = span['bbox']\n",
"    # Compare against the vertical midpoint so that a highlight which barely\n",
"    # overlaps a neighbouring line does not mark that line as well.\n",
"    mid_y = (y0 + y1) / 2\n",
"    return any(\n",
"        rect.y0 <= mid_y <= rect.y1 and rect.x0 < x1 and x0 < rect.x1\n",
"        for rect in highlight_rects\n",
"    )\n",
"\n",
"\n",
"def extract_tables_from_pdf(pdf_path):\n",
"    \"\"\"\n",
"    Extracts the text blocks of a PDF file and flags their highlighted rows.\n",
"\n",
"    Args:\n",
"        pdf_path (str): Path to the PDF file.\n",
"\n",
"    Returns:\n",
"        list: One list per text block; each of them holds a\n",
"            (row_text, is_highlighted) tuple for every line of the block.\n",
"    \"\"\"\n",
"    tables = []\n",
"\n",
"    # The context manager closes the document even if extraction fails.\n",
"    with fitz.open(pdf_path) as doc:\n",
"        for page in doc:\n",
"            highlight_rects = [\n",
"                annot.rect\n",
"                for annot in page.annots(types=[fitz.PDF_ANNOT_HIGHLIGHT])\n",
"            ]\n",
"\n",
"            for block in page.get_text(\"dict\")['blocks']:\n",
"                if 'lines' not in block:  # e.g. image blocks carry no text lines\n",
"                    continue\n",
"\n",
"                table_data = []\n",
"                for line in block['lines']:\n",
"                    spans = line['spans']\n",
"                    row_data = ' '.join(span['text'] for span in spans).strip()\n",
"                    highlight_detected = any(\n",
"                        _span_is_highlighted(span, highlight_rects)\n",
"                        for span in spans\n",
"                    )\n",
"                    table_data.append((row_data, highlight_detected))\n",
"\n",
"                if table_data:\n",
"                    tables.append(table_data)\n",
"\n",
"    return tables"
]
},
|
||||
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"def generate_quiz_from_tables(tables):\n",
"    \"\"\"\n",
"    Builds a multiple-choice quiz out of the extracted rows.\n",
"\n",
"    Args:\n",
"        tables (list): One entry per table; every entry is a sequence of\n",
"            (row_text, is_highlighted) pairs whose first pair is the question.\n",
"\n",
"    Returns:\n",
"        dict: Maps each question text to its list of choices, where a choice\n",
"            is a dict with the keys \"choice\" and \"correct\". Highlighted\n",
"            choices are put at the front of the list.\n",
"    \"\"\"\n",
"    quiz = {}\n",
"\n",
"    for rows in tables:\n",
"        answers = None  # stays None until the question row has been read\n",
"\n",
"        for text, marked in rows:\n",
"            if answers is None:\n",
"                # The first row is the question; its highlight flag is ignored.\n",
"                answers = quiz[text] = []\n",
"                continue\n",
"\n",
"            option = {\"choice\": text, \"correct\": bool(marked)}\n",
"            if marked:\n",
"                answers.insert(0, option)\n",
"            else:\n",
"                answers.append(option)\n",
"\n",
"    return quiz"
]
},
|
||||
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"outputs": [],
"source": [
"def main(pdf_path=\"pelda_2.pdf\", output_json=\"quiz.json\"):\n",
"    \"\"\"\n",
"    Converts a PDF into a quiz and stores the quiz as a JSON file.\n",
"\n",
"    Args:\n",
"        pdf_path (str): Path of the PDF file to read.\n",
"        output_json (str): Path of the JSON file to write.\n",
"    \"\"\"\n",
"    # Imported before the output file is opened, so a failing import cannot\n",
"    # leave an empty, truncated file behind.\n",
"    import json\n",
"\n",
"    print(\"Extracting tables from the PDF...\")\n",
"    tables = extract_tables_from_pdf(pdf_path)\n",
"\n",
"    print(\"Generating the quiz...\")\n",
"    quiz = generate_quiz_from_tables(tables)\n",
"\n",
"    print(\"Saving the quiz to JSON file...\")\n",
"    with open(output_json, \"w\", encoding=\"utf-8\") as f:\n",
"        json.dump(quiz, f, ensure_ascii=False, indent=4)\n",
"\n",
"    print(f\"Quiz saved to {output_json}\")\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
"    main()\n"
]
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
BIN
pelda_2.pdf
Normal file
BIN
pelda_2.pdf
Normal file
Binary file not shown.
Reference in New Issue
Block a user