From 32d805ecf125d677db2e6d78a61d1c65b3bb4119 Mon Sep 17 00:00:00 2001 From: Kilokem Date: Mon, 9 Dec 2024 19:41:42 +0100 Subject: [PATCH] Pandas feladatok --- SajatMegoldasok/vizsga_01_minta.ipynb | 120 +++++++++++++++++++++++++- SajatMegoldasok/vizsga_02_minta.ipynb | 99 ++++++++++++++++++++- SajatMegoldasok/vizsga_03_minta.ipynb | 96 +++++++++++++++++++-- 3 files changed, 301 insertions(+), 14 deletions(-) diff --git a/SajatMegoldasok/vizsga_01_minta.ipynb b/SajatMegoldasok/vizsga_01_minta.ipynb index 263423b..2995a17 100644 --- a/SajatMegoldasok/vizsga_01_minta.ipynb +++ b/SajatMegoldasok/vizsga_01_minta.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -145,6 +145,118 @@ "- Cégkategóriánként hány dollárt fektettek be összesen?" ] }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 LifeLock\n", + "1 LifeLock\n", + "2 LifeLock\n", + "3 MyCityFaces\n", + "4 Flypaper\n", + "Name: company, dtype: object\n", + "The head: company numEmps category city state fundedDate raisedAmt \\\n", + "0 LifeLock NaN web Tempe AZ 1-May-07 6850000 \n", + "1 LifeLock NaN web Tempe AZ 1-Oct-06 6000000 \n", + "2 LifeLock NaN web Tempe AZ 1-Jan-08 25000000 \n", + "3 MyCityFaces 7.0 web Scottsdale AZ 1-Jan-08 50000 \n", + "4 Flypaper NaN web Phoenix AZ 1-Feb-08 3000000 \n", + "\n", + " raisedCurrency round \n", + "0 USD b \n", + "1 USD a \n", + "2 USD c \n", + "3 USD seed \n", + "4 USD a \n", + "1434\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "frame = pd.read_csv(\"investments.txt\", sep=\"|\")\n", + "\n", + "print(frame[\"company\"].iloc[:5])\n", + "\n", + "print(\"The head:\", frame.head())\n", + "\n", + "framecount = frame[\"company\"].count()\n", + "print(framecount)\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "company\n", + "Facebook 7\n", + "Juice Wireless 5\n", + "Viv’simo 5\n", + "Glam Media 5\n", + "Brightcove 5\n", + " ..\n", + "AdReady 1\n", + "AdMob 1\n", + "Acquia 1\n", + "x+1 1\n", + "vbs tv 1\n", + "Name: category, Length: 891, dtype: int64\n" + ] + } + ], + "source": [ + "grouped = frame.groupby(\"company\").count()\n", + "\n", + "grouped = frame[\"company\"].value_counts()\n", + "\n", + "grouped = frame.groupby(\"company\")[\"category\"].count().sort_values(ascending=False)\n", + "\n", + "\n", + "print(grouped)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "category\n", + "web 11753474750\n", + "software 1017942000\n", + "hardware 824500000\n", + "mobile 323020000\n", + "cleantech 258900000\n", + "other 119850000\n", + "biotech 77250000\n", + "consulting 32135000\n", + "Name: raisedAmt, dtype: int64\n" + ] + } + ], + "source": [ + "categ = frame.groupby(\"category\")[\"raisedAmt\"].sum().sort_values(ascending=False)\n", + "\n", + "print(categ)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -169,7 +281,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.13.0" } }, "nbformat": 4, diff --git a/SajatMegoldasok/vizsga_02_minta.ipynb b/SajatMegoldasok/vizsga_02_minta.ipynb index 1b57e11..a6a0ffe 100644 --- a/SajatMegoldasok/vizsga_02_minta.ipynb +++ b/SajatMegoldasok/vizsga_02_minta.ipynb @@ -137,17 +137,108 @@ "- Hány ponttal magasabb a medencével (`Pool`) rendelkező szállodák átlagos értékelése (`Score`) a többi szálloda átlagos értékelésénél?" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Hotel name Nr. rooms Pool Gym Tennis court \\\n", + "0 Circus Circus Hotel & Casino Las Vegas 3773 NO YES NO \n", + "1 Circus Circus Hotel & Casino Las Vegas 3773 NO YES NO \n", + "2 Circus Circus Hotel & Casino Las Vegas 3773 NO YES NO \n", + "3 Circus Circus Hotel & Casino Las Vegas 3773 NO YES NO \n", + "4 Circus Circus Hotel & Casino Las Vegas 3773 NO YES NO \n", + "\n", + " Spa Casino Traveler type Period of stay Score \n", + "0 NO YES Friends Dec-Feb 5 \n", + "1 NO YES Business Dec-Feb 3 \n", + "2 NO YES Families Mar-May 5 \n", + "3 NO YES Friends Mar-May 4 \n", + "4 NO YES Solo Mar-May 4 \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "frame = pd.read_csv(\"hotels.txt\", sep=';')\n", + "print(frame[:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hotel name\n", + "The Venetian Las Vegas Hotel 96648\n", + "Excalibur Hotel & Casino 95544\n", + "Bellagio Las Vegas 94392\n", + "Circus Circus Hotel & Casino Las Vegas 90552\n", + "Caesars Palace 80352\n", + "Name: Nr. rooms, dtype: int64\n" + ] + } + ], + "source": [ + "legtobb = frame.groupby(\"Hotel name\")[\"Nr. rooms\"].sum().sort_values(ascending=False)[:5]\n", + "\n", + "print(legtobb)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hotel name\n", + "Wynn Las Vegas 4.625\n", + "Name: Score, dtype: float64\n" + ] + } + ], + "source": [ + "scored = frame.groupby(\"Hotel name\")[\"Score\"].mean().sort_values(ascending=False)[:1]\n", + "\n", + "print(scored)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.9604166666666667\n" + ] + } + ], + "source": [ + "withspas = frame[frame[\"Pool\"] == \"YES\"][\"Score\"].mean()\n", + "withoutspas = frame[frame[\"Pool\"] == \"NO\"][\"Score\"].mean()\n", + "\n", + "print(withspas - withoutspas)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -161,7 +252,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.13.0" } }, "nbformat": 4, diff --git a/SajatMegoldasok/vizsga_03_minta.ipynb b/SajatMegoldasok/vizsga_03_minta.ipynb index a62fa6a..9348e5a 100644 --- a/SajatMegoldasok/vizsga_03_minta.ipynb +++ b/SajatMegoldasok/vizsga_03_minta.ipynb @@ -168,16 +168,100 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "### 3. feladat [10p]\n", + "\n", + "Az [unicef.txt](unicef.txt) szövegfájl a világ 5 év alatti népességének élelmezési helyzetéről tartalmaz adatokat. Az egyes sorok felméréseknek felelnek meg, a felmérések országonként időbeli sorrendben vannak felsorolva. Töltsük be az adatokat, határozzuk meg és írjuk ki az alábbi statisztikákat!\n", + "- Hány felmérés készült és hány országot érintett?\n", + "- Az alábbi statisztikákat csak azon felmérések alapján készítsük el, amelyeknél mind a három érintett indikátor (`Severe Wasting`, `Underweight`, `Overweight`) definiált (azaz ezek pozitív adatok). Ha egy országra több ilyen felmérés is van, akkor a legutóbbit vegyük figyelembe!\n", + " - Mely 5 országban a legmagasabb a `Severe Wasting` indikátor?\n", + " - Az országok hányadrészében magasabb az `Underweight` indikátor az `Overweight` indikátornál?" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Country United Nations Region United Nations Sub-Region \\\n", + "0 AFGHANISTAN Asia Southern Asia \n", + "1 AFGHANISTAN Asia Southern Asia \n", + "2 AFGHANISTAN Asia Southern Asia \n", + "3 ALBANIA Europe Southern Europe \n", + "4 ALBANIA Europe Southern Europe \n", + "\n", + " World Bank Income Classification Survey Year Survey Sample (N) \\\n", + "0 Low Income 1997 4846.0 \n", + "1 Low Income 2004 946.0 \n", + "2 Low Income 2013 21922.0 \n", + "3 Upper Middle Income 1996-98 7642.0 \n", + "4 Upper Middle Income 2000 1382.0 \n", + "\n", + " Severe Wasting Wasting Stunting Underweight Overweight \\\n", + "0 NaN 18,2 53,2 44,9 6,5 \n", + "1 3,5 8,6 59,3 32,9 4,6 \n", + "2 4,0 9,5 40,9 25,0 5,4 \n", + "3 NaN 8,1 20,4 7,1 9,5 \n", + "4 6,2 12,2 39,2 17,0 30,0 \n", + "\n", + " Source Notes \\\n", + "0  Afghanistan 1997 multiple indicator baseline ... Converted estimates \n", + "1 Summary report of the national nutrition surve... NaN \n", + "2 Afghanistan National Nutrition Survey 2013. (pending reanalysis) \n", + "3 National study on nutrition in Albania. Instit... Converted estimates \n", + "4 Multiple indicator cluster survey report Alban... NaN \n", + "\n", + " U5 Population ('000s) \n", + "0 3637,632 \n", + "1 4667,487 \n", + "2 5235,867 \n", + "3 307,887 \n", + "4 278,753 \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "frame = pd.read_csv(\"unicef.txt\", sep=\"|\")\n", + "print(frame[:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "keszult: 854\n", + "Orszag: 152\n" + ] + } + ], + "source": [ + "orszag = frame[\"Country\"].count()\n", + "\n", + "print(\"keszult: \",orszag)\n", + "\n", + "city = frame.groupby(\"Country\")[\"Survey Year\"].count().count()\n", + "\n", + "\n", + "print(\"Orszag: \",city)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -191,7 +275,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.5" + "version": "3.13.0" } }, "nbformat": 4,