some kind of pipeline with sklearn?
This commit is contained in:
parent
b67e483be2
commit
b343b99395
5 changed files with 1570 additions and 24 deletions
|
@ -1066,7 +1066,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.12"
|
||||
"version": "3.12.8"
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
|
|
381
Aufgaben/05 - lineare regression - sklearn.ipynb
Normal file
381
Aufgaben/05 - lineare regression - sklearn.ipynb
Normal file
|
@ -0,0 +1,381 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f19ad4d1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Lineare Regression mit scikit-learn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "fca110ae",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"# plotting settings\n",
|
||||
"pd.plotting.register_matplotlib_converters()\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"%matplotlib inline\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.linear_model import LinearRegression"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "6edf6b65",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>BuildingArea</th>\n",
|
||||
" <th>Rooms</th>\n",
|
||||
" <th>Price</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>79.0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1035000.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>150.0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1465000.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>142.0</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>1600000.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>210.0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1876000.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>107.0</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1636000.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" BuildingArea Rooms Price\n",
|
||||
"1 79.0 2 1035000.0\n",
|
||||
"2 150.0 3 1465000.0\n",
|
||||
"4 142.0 4 1600000.0\n",
|
||||
"6 210.0 3 1876000.0\n",
|
||||
"7 107.0 2 1636000.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"melbourne_file_path = 'data/melb_data.csv'\n",
|
||||
"melbourne_data = pd.read_csv(melbourne_file_path)\n",
|
||||
"melbourne_data = melbourne_data.dropna(axis=0) # entfernen von Daten mit fehlenden Werten\n",
|
||||
"# wählen für unser Beispiel einen kleinen Ausschnitt aus den Daten (denselben, wie im ersten Beispiel)\n",
|
||||
"max_area = 400\n",
|
||||
"max_datapoints = 100\n",
|
||||
"data = melbourne_data[melbourne_data['BuildingArea'] < max_area][:max_datapoints][['BuildingArea', 'Rooms', 'Price']]\n",
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7b226e29",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Die scikit-learn API erwartet die Trainingsdaten (Inputs, Features) ähnlich zu unserer Feature-Matrix als 2D-Array, allerdings ohne die Spalte mit den Einsen ('1'); der Achsenabschnitt (w0) wird vom Modell selbst bestimmt."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "9e851d19",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([1, 2, 3])"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"r = np.array([1,2,3])\n",
|
||||
"r"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "a2903e63",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([[1],\n",
|
||||
" [2],\n",
|
||||
" [3]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ändern eines Vektors der Länge n in eine Matrix der Dimension n x 1:\n",
|
||||
"r.reshape((-1,1))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "89d8210c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x = data[['BuildingArea'] ].to_numpy(copy=True).reshape((-1, 1))\n",
|
||||
"y = data['Price'].to_numpy(copy=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "b0d5c350",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(100, 1)"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "bd1d76e9",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"w0: 441524.4208318128\n",
|
||||
"w1: [6024.22929588]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = LinearRegression().fit(x, y)\n",
|
||||
"# print model parameters\n",
|
||||
"print('w0: {}'.format(model.intercept_))\n",
|
||||
"print('w1: {}'.format(model.coef_))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "12eacd13",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Vergleichen Sie die Werte mit der selbstprogrammierten Lösung!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c5f6e509",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Achtung:** Alle scikit-learn Modelle haben Parameter, die das Verhalten beeinflussen. Mehr in der scikit-learn Dokumentation, z.B. hier für LinearRegression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "055183fa",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"erklärte Varianz (R^2): 0.22971025499088593\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"r_sq = model.score(x, y)\n",
|
||||
"print('erklärte Varianz (R^2): {}'.format(r_sq))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5a66667c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Vorhersage mit trainiertem scikit-learn Modell"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "b81f3f03",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Preis-Vorhersage für ein neues Haus mit Wohnfläche 287: [2170478.22874863]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"building_area_new_house = [[287]]\n",
|
||||
"y_pred = model.predict(building_area_new_house)\n",
|
||||
"print('Preis-Vorhersage für ein neues Haus mit Wohnfläche {}: {}'.format(building_area_new_house[0][0], y_pred))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "0d1da016",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Preis-Vorhersagen für Objekte mit den Wohnflächen [[287], [80], [110]]: [2170478.22874863 923462.76450201 1104189.64337833]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Vorhersage für mehrere Objekte auf einmal\n",
|
||||
"building_area_new_houses = [[287], [80], [110]]\n",
|
||||
"y_preds = model.predict(building_area_new_houses)\n",
|
||||
"print('Preis-Vorhersagen für Objekte mit den Wohnflächen {}: {}'.format(building_area_new_houses, y_preds))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5aac9e7a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Alle scikit-learn Modelle haben bestimmte Funktionen, insbesondere `model.fit()` und `model.predict()`, sodass es sehr einfach ist, verschiedene ML-Modelle auszuprobieren."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f39ef84b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Aufgabe: Mehrere Input-Features ausprobieren!**"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.1"
|
||||
},
|
||||
"varInspector": {
|
||||
"cols": {
|
||||
"lenName": 16,
|
||||
"lenType": 16,
|
||||
"lenVar": 40
|
||||
},
|
||||
"kernels_config": {
|
||||
"python": {
|
||||
"delete_cmd_postfix": "",
|
||||
"delete_cmd_prefix": "del ",
|
||||
"library": "var_list.py",
|
||||
"varRefreshCmd": "print(var_dic_list())"
|
||||
},
|
||||
"r": {
|
||||
"delete_cmd_postfix": ") ",
|
||||
"delete_cmd_prefix": "rm(",
|
||||
"library": "var_list.r",
|
||||
"varRefreshCmd": "cat(var_dic_list()) "
|
||||
}
|
||||
},
|
||||
"types_to_exclude": [
|
||||
"module",
|
||||
"function",
|
||||
"builtin_function_or_method",
|
||||
"instance",
|
||||
"_Feature"
|
||||
],
|
||||
"window_display": false
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
|
@ -31,6 +31,7 @@ ipywidgets==8.1.5
|
|||
isoduration==20.11.0
|
||||
jedi==0.19.2
|
||||
Jinja2==3.1.5
|
||||
joblib==1.4.2
|
||||
json5==0.10.0
|
||||
jsonpointer==3.0.0
|
||||
jsonschema==4.23.0
|
||||
|
@ -86,6 +87,8 @@ requests==2.32.3
|
|||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
rpds-py==0.22.3
|
||||
scikit-learn==1.6.1
|
||||
scipy==1.15.1
|
||||
seaborn==0.13.2
|
||||
Send2Trash==1.8.3
|
||||
setuptools==75.8.0
|
||||
|
@ -94,6 +97,7 @@ sniffio==1.3.1
|
|||
soupsieve==2.6
|
||||
stack-data==0.6.3
|
||||
terminado==0.18.1
|
||||
threadpoolctl==3.5.0
|
||||
tinycss2==1.4.0
|
||||
tornado==6.4.2
|
||||
tqdm==4.67.1
|
||||
|
|
1079
tasks/03skykit-regularisierung.ipynb
Normal file
1079
tasks/03skykit-regularisierung.ipynb
Normal file
File diff suppressed because it is too large
Load diff
File diff suppressed because one or more lines are too long
Loading…
Add table
Reference in a new issue