some kind of pipeline with sklearn?

2025-01-30 10:07:11 +01:00 · 2025-01-30 10:07:11 +01:00 · b343b99395
commit b343b99395
parent b67e483be2
5 changed files with 1570 additions and 24 deletions
--- a/feature.ipynb
+++ b/feature.ipynb
@ -1066,7 +1066,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.8"
  },
  "varInspector": {
   "cols": {
--- a/sklearn.ipynb
+++ b/sklearn.ipynb
@ -0,0 +1,381 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f19ad4d1",
+   "metadata": {},
+   "source": [
+    "# Lineare Regression mit scikit-learn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "fca110ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "# plotting settings\n",
+    "pd.plotting.register_matplotlib_converters()\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline\n",
+    "import seaborn as sns\n",
+    "from sklearn.linear_model import LinearRegression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "6edf6b65",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>BuildingArea</th>\n",
+       "      <th>Rooms</th>\n",
+       "      <th>Price</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>79.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1035000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>150.0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1465000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>142.0</td>\n",
+       "      <td>4</td>\n",
+       "      <td>1600000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>210.0</td>\n",
+       "      <td>3</td>\n",
+       "      <td>1876000.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>107.0</td>\n",
+       "      <td>2</td>\n",
+       "      <td>1636000.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   BuildingArea  Rooms      Price\n",
+       "1          79.0      2  1035000.0\n",
+       "2         150.0      3  1465000.0\n",
+       "4         142.0      4  1600000.0\n",
+       "6         210.0      3  1876000.0\n",
+       "7         107.0      2  1636000.0"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "melbourne_file_path = 'data/melb_data.csv'\n",
+    "melbourne_data = pd.read_csv(melbourne_file_path)\n",
+    "melbourne_data = melbourne_data.dropna(axis=0)  # entfernen von Daten mit fehlenden Werten\n",
+    "# wählen für unser Beispiel einen kleinen Ausschnitt aus den Daten (denselben, wie im ersten Beispiel)\n",
+    "max_area = 400\n",
+    "max_datapoints = 100\n",
+    "data = melbourne_data[melbourne_data['BuildingArea'] < max_area][:max_datapoints][['BuildingArea', 'Rooms', 'Price']]\n",
+    "data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7b226e29",
+   "metadata": {},
+   "source": [
+    "Die scikit-learn-API erwartet die Trainingsdaten (Inputs, Features) als 2D-Array, ähnlich wie unsere Feature-Matrix, allerdings ohne die '1'."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9e851d19",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([1, 2, 3])"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "r = np.array([1,2,3])\n",
+    "r"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "a2903e63",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[1],\n",
+       "       [2],\n",
+       "       [3]])"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# ändern eines Vektors der Länge n in eine Matrix der Dimension n x 1:\n",
+    "r.reshape((-1,1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "89d8210c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = data[['BuildingArea'] ].to_numpy(copy=True).reshape((-1, 1))\n",
+    "y = data['Price'].to_numpy(copy=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b0d5c350",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(100, 1)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "x.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "bd1d76e9",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "w0: 441524.4208318128\n",
+      "w1: [6024.22929588]\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = LinearRegression().fit(x, y)\n",
+    "# print model parameters\n",
+    "print('w0: {}'.format(model.intercept_))\n",
+    "print('w1: {}'.format(model.coef_))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12eacd13",
+   "metadata": {},
+   "source": [
+    "Vergleichen Sie die Werte mit der selbstprogrammierten Lösung!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5f6e509",
+   "metadata": {},
+   "source": [
+    "**Achtung:** Alle scikit-learn-Modelle haben Parameter, die das Verhalten beeinflussen. Mehr dazu in der scikit-learn-Dokumentation, z.B. hier für LinearRegression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "055183fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "erklärte Varianz (R^2): 0.22971025499088593\n"
+     ]
+    }
+   ],
+   "source": [
+    "r_sq = model.score(x, y)\n",
+    "print('erklärte Varianz (R^2): {}'.format(r_sq))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5a66667c",
+   "metadata": {},
+   "source": [
+    "Vorhersage mit trainiertem scikit-learn Modell"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "b81f3f03",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Preis-Vorhersage für ein neues Haus mit Wohnfläche 287: [2170478.22874863]\n"
+     ]
+    }
+   ],
+   "source": [
+    "building_area_new_house = [[287]]\n",
+    "y_pred = model.predict(building_area_new_house)\n",
+    "print('Preis-Vorhersage für ein neues Haus mit Wohnfläche {}: {}'.format(building_area_new_house[0][0], y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "0d1da016",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Preis-Vorhersagen für Objekte mit den Wohnflächen [[287], [80], [110]]: [2170478.22874863  923462.76450201 1104189.64337833]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Vorhersage für mehrere Objekte auf einmal\n",
+    "building_area_new_houses = [[287], [80], [110]]\n",
+    "y_preds = model.predict(building_area_new_houses)\n",
+    "print('Preis-Vorhersagen für Objekte mit den Wohnflächen {}: {}'.format(building_area_new_houses, y_preds))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5aac9e7a",
+   "metadata": {},
+   "source": [
+    "Alle scikit-learn-Modelle haben bestimmte gemeinsame Funktionen, insbesondere `model.fit()` und `model.predict()`, sodass es sehr einfach ist, verschiedene ML-Modelle auszuprobieren."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f39ef84b",
+   "metadata": {},
+   "source": [
+    "**Aufgabe: Mehrere Input-Features ausprobieren!**"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.1"
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/requirements.txt
+++ b/requirements.txt
@ -31,6 +31,7 @@ ipywidgets==8.1.5
 isoduration==20.11.0
 jedi==0.19.2
 Jinja2==3.1.5
+joblib==1.4.2
 json5==0.10.0
 jsonpointer==3.0.0
 jsonschema==4.23.0
@ -86,6 +87,8 @@ requests==2.32.3
 rfc3339-validator==0.1.4
 rfc3986-validator==0.1.1
 rpds-py==0.22.3
+scikit-learn==1.6.1
+scipy==1.15.1
 seaborn==0.13.2
 Send2Trash==1.8.3
 setuptools==75.8.0
@ -94,6 +97,7 @@ sniffio==1.3.1
 soupsieve==2.6
 stack-data==0.6.3
 terminado==0.18.1
+threadpoolctl==3.5.0
 tinycss2==1.4.0
 tornado==6.4.2
 tqdm==4.67.1
--- a/tasks/03skykit-regularisierung.ipynb
+++ b/tasks/03skykit-regularisierung.ipynb
--- a/tasks/Untitled.ipynb
+++ b/tasks/Untitled.ipynb